diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..fd3a6257b65ba0c3ce37c751d3703f362ab16304 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,294 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0031.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0081.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0123.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0155.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0216.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0277.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0015.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0047.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0051.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0054.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0088.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0250.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0009.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0089.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0117.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0241.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0101.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0110.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0208.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0226.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0284.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0060.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0252.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0058.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0099.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0195.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0057.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0105.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0125.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0169.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0184.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0196.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0075.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0236.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0276.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0006.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0156.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0082.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0106.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0157.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0188.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0201.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0225.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0248.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0023.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0116.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0119.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0254.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0278.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0045.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0093.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0182.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0064.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0094.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0104.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0113.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0150.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0189.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0220.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0261.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0011.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0048.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0288.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0034.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0108.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0214.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0287.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0100.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0198.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0227.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0244.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0245.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0270.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0039.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0055.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0086.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0174.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0181.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0266.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0283.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0073.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0080.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0274.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0279.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0036.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0050.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0069.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0053.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0056.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0145.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0027.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0067.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0079.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0013.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0072.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0191.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0263.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0268.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0041.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0136.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0170.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0180.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0200.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0217.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0280.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0016.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0018.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0062.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0122.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0147.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0265.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0215.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0133.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0165.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0166.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0222.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0078.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0171.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0219.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0028.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0107.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0144.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0178.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0190.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0043.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0010.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0021.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0160.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0247.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0063.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0090.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0137.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0159.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0269.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0014.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0026.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0033.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0035.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0046.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0186.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0237.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0179.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0193.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0232.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0109.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0134.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0286.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0003.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0004.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0206.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0251.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0040.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0083.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0230.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0272.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0275.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0096.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0115.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0260.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0271.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0012.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0022.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0176.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0218.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0273.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0065.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0132.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0187.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0267.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0044.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0029.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0084.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0087.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0238.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0253.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0257.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0102.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0103.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0148.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0242.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0258.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0005.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0008.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0032.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0037.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0070.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0207.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0235.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0061.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0068.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0077.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0204.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0239.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0255.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0289.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0025.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0052.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0066.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0131.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0163.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0259.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0224.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0249.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0121.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0140.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0143.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0151.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0095.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0111.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0139.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0211.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0019.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0076.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0152.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0212.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0223.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0017.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0142.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0158.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0233.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0256.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0262.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0282.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0020.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0024.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0199.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0264.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0002.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0092.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0120.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0071.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0074.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0203.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0285.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0085.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0127.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0185.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0281.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0098.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0112.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0141.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0146.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0164.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0240.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0246.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0097.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0149.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0162.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0030.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0049.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0177.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0209.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0213.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0059.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0091.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0129.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0172.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0175.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0183.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0194.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0231.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0001.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0130.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0168.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0202.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0210.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0234.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0038.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0042.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0114.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0124.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0138.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0153.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0154.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0161.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0173.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0221.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0229.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0118.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0126.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0135.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0167.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0192.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0290.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0007.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0128.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0197.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0243.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0205.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0228.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/OCR_directory.sh b/OCR_directory.sh
new file mode 100644
index 0000000000000000000000000000000000000000..48fa92b71c41dfc9e0269f303709dd0bfc17fda6
--- /dev/null
+++ b/OCR_directory.sh
@@ -0,0 +1,17 @@
+# pdftk thesis.pdf burst
+
+# Using pdftotext, extract the text of each page in assets/pdfs and store it
+# in assets/txts under a matching basename.
+for pdf in assets/pdfs/*.pdf
+do
+    pdftotext "$pdf" assets/txts/$(basename "$pdf" .pdf).txt
+    # Alternative extractor (pdfminer): pdf2txt.py -o assets/txts/$(basename "$pdf" .pdf).txt "$pdf"
+done
+
+# Rasterize each page to a PNG (e.g., for displaying answer pages in app.py).
+for pdf in assets/pdfs/*.pdf
+do
+    convert -density 100 -quality 100 -colorspace RGB -alpha remove -alpha off "$pdf" assets/pngs/$(basename "$pdf" .pdf).png
+done
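+
+# Hypothetical one-time setup (an assumption, not part of the original
+# script): create the output directories and burst the source manuscript
+# into the per-page PDFs the loops above expect. thesis.pdf is the name
+# used in the comment at the top of this file; pg_%04d.pdf is pdftk's
+# default burst pattern and matches the files tracked in .gitattributes.
+mkdir -p assets/pdfs assets/txts assets/pngs
+pdftk thesis.pdf burst output assets/pdfs/pg_%04d.pdf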
diff --git a/app.py b/app.py
index cbfee3ba4892f516b6e52d77f1ad51e7ccf5fe86..c73256166c40d3e087d7583ffc36ee4cf7370b8f 100644
--- a/app.py
+++ b/app.py
@@ -1,67 +1,114 @@
-import streamlit as st
-from llama_index import VectorStoreIndex
-from llama_index import ServiceContext
-from llama_index.embeddings import HuggingFaceEmbedding
-from llama_index.llms import HuggingFaceInferenceAPI
-from llama_index.schema import Document
-from PyPDF2 import PdfReader
-
-# Streamlit title and description
-st.title("PDF querying using Llama-Index by Rahul Bhoyar")
-st.write("Base Model: **HuggingFaceH4/zephyr-7b-alpha (open-source from HuggingFace)**")
-st.write("Embedding Model: **WhereIsAI/UAE-Large-V1 (open-source from HuggingFace)**")
-st.write("This app allows you to upload your own PDF and query your document.")
-
-hf_token = st.text_input("Enter your Hugging Face token:")
-
-
-def read_pdf(uploaded_file):
-    pdf_reader = PdfReader(uploaded_file)
-    text = ""
-    for page_num in range(len(pdf_reader.pages)):
-        text += pdf_reader.pages[page_num].extract_text()
-    return text
-
-
-# Streamlit input for user file upload
-success = False
-query_engine_creation = False
-uploaded_pdf = st.file_uploader("Upload your PDF", type=['pdf'])
-
-# Load data and configure the index
-if uploaded_pdf is not None:
-    file_contents = read_pdf(uploaded_pdf)
-    documents = Document(text=file_contents)
-    documents = [documents]
-    st.success("Documents loaded successfully!")
-
-    model = st.selectbox('Select the model', ('google/flan-t5-xxl','HuggingFaceH4/zephyr-7b-alpha'), index=0)
-    llm = HuggingFaceInferenceAPI(model_name=model, token=hf_token)
-
-    with st.spinner('Creating Vector Embeddings...'):
-        embed_model_uae = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")
-        service_context = ServiceContext.from_defaults(
-            llm=llm, chunk_size=800, chunk_overlap=20, embed_model=embed_model_uae
-        )
-        index = VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=True)
-        index.storage_context.persist()
-        query_engine = index.as_query_engine()
-        query_engine_creation = True
-        # Display the result of the task
-        st.success("Vector embeddings created.")
-        success = True
-else:
-    st.write("Please upload a file first.")
-
-if query_engine_creation:
-
-    # Streamlit input for user query
-    if success:
-        user_query = st.text_input("Enter your query:")
-
-        # Query engine with user input
-        if user_query:
-            with st.spinner('Fetching the response...'):
-                response = query_engine.query(user_query)
-
-            st.markdown(f"**Response:** {response}")
+import torch
+from transformers import BitsAndBytesConfig
+from llama_index.llms.huggingface import HuggingFaceLLM
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core import VectorStoreIndex, SummaryIndex
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core import Settings
+
+import gradio as gr
+
+
+def messages_to_prompt(messages):
+    prompt = ""
+    for message in messages:
+        if message.role == "system":
+            # Override any incoming system message with a fixed domain prompt.
+            m = "You are an expert in the research field of document understanding, bayesian deep learning and neural networks."
+            prompt += f"<|system|>\n{m}\n"
+        elif message.role == "user":
+            prompt += f"<|user|>\n{message.content}\n"
+        elif message.role == "assistant":
+            prompt += f"<|assistant|>\n{message.content}\n"
+
+    # Ensure we start with a system prompt; insert a blank one if needed.
+    if not prompt.startswith("<|system|>\n"):
+        prompt = "<|system|>\n\n" + prompt
+
+    # Add the final assistant prompt.
+    prompt = prompt + "<|assistant|>\n"
+
+    return prompt
+
+
+def load_RAG_pipeline():
+    # LLM: 4-bit quantized Zephyr-7B served through llama-index.
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    )
+
+    llm = HuggingFaceLLM(
+        model_name="HuggingFaceH4/zephyr-7b-alpha",
+        tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
+        query_wrapper_prompt=PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"),
+        context_window=3900,
+        max_new_tokens=256,
+        model_kwargs={"quantization_config": quantization_config},
+        # tokenizer_kwargs={},
+        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
+        messages_to_prompt=messages_to_prompt,
+        device_map="auto",
+    )
+
+    # Llama-index global settings.
+    Settings.llm = llm
+    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+    # Settings.chunk_size = 512
+    # Settings.chunk_overlap = 50
+
+    # Raw data: the per-page text files extracted by OCR_directory.sh.
+    documents = SimpleDirectoryReader("assets/txts").load_data()
+    vector_index = VectorStoreIndex.from_documents(documents)
+    # summary_index = SummaryIndex.from_documents(documents)
+    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=3)
+    return query_engine
+
+
+query_engine = load_RAG_pipeline()
+
+
+# These are placeholder functions to simulate the behavior of the RAG setup;
+# replace them with the actual logic to retrieve and generate answers from the document.
+def get_answer(question, temperature, nucleus_sampling, max_tokens):
+    # NOTE: temperature, nucleus_sampling, and max_tokens are currently ignored;
+    # one way to wire them through is sketched below.
+    # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
+    return query_engine.query(question)
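+
+# Minimal sketch (added for illustration; not part of the original app): one
+# way to honor the UI sampling controls on a per-query basis, assuming the
+# HuggingFaceLLM instance keeps its generate_kwargs dict and max_new_tokens
+# field mutable at runtime. E.g., call it at the top of get_answer before
+# querying.
+def apply_generation_settings(temperature, nucleus_sampling, max_tokens):
+    llm = Settings.llm
+    llm.generate_kwargs["temperature"] = temperature
+    llm.generate_kwargs["top_p"] = nucleus_sampling
+    llm.max_new_tokens = max_tokens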
+
+
+def get_answer_page(question):
+    # Implement logic to retrieve the page number or an image of the page with
+    # the answer, e.g., a path to the matching rendered page under assets/pngs.
+    answer_page = "Page X - placeholder image."
+    return answer_page
+
+
+# Combine both outputs for the gr.Interface function.
+def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
+    answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
+    answer_page = get_answer_page(question)
+    return answer, answer_page
+
+
+# Set up the interface options based on the design in the image.
+iface = gr.Interface(
+    fn=ask_my_thesis,
+    inputs=[
+        gr.Textbox(label="Question", placeholder="Type your question here..."),
+        gr.Slider(0, 1, value=0.7, label="Temperature"),
+        gr.Slider(0, 1, value=0.9, label="Nucleus Sampling"),
+        gr.Slider(1, 500, value=100, label="Max Generated Number of Tokens"),
+    ],
+    outputs=[gr.Textbox(label="Answer"), gr.Image(label="Answer Page")],
+    title="Ask my thesis",
+    description="Chat with the manuscript: ask questions and receive answers with references.",
+    allow_flagging="never",
+)
+
+# Start the application.
+if __name__ == "__main__":
+    iface.launch()
diff --git a/assets/txts/pg_0002.txt b/assets/txts/pg_0002.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf
--- /dev/null
+++ b/assets/txts/pg_0002.txt
@@ -0,0 +1 @@
+ 
\ No newline at end of file
diff --git a/assets/txts/pg_0003.txt b/assets/txts/pg_0003.txt
new file mode 100644
index 0000000000000000000000000000000000000000..83e60fa37e6fcb448f5b9ea80c6f69fe266013cf
--- /dev/null
+++ b/assets/txts/pg_0003.txt
@@ -0,0 +1,25 @@
+Intelligent Automation for AI-Driven Document Understanding
+
+Jordy VAN LANDEGHEM
+
+Examination committee:
+em. Prof. Dr. ir. Jean-Pierre Celis, chair
+Prof. Dr. Marie-Francine Moens, supervisor
+Prof. Dr. Matthew B. Blaschko, supervisor
+Prof. Dr. ir. Johan Suykens
+Prof. Dr. ir. Tinne Tuytelaars
+Prof. Dr. Marcus Rohrbach (TU Darmstadt)
+Prof. Dr. Wenpeng Yin (Penn State University)
+Dr. Bertrand Anckaert (Contract.fit)
+March 2024
+
+Dissertation presented in partial fulfillment of the requirements for the degree of Doctor of Engineering Science (PhD): Computer Science
\ No newline at end of file
diff --git a/assets/txts/pg_0004.txt b/assets/txts/pg_0004.txt
new file mode 100644
index 0000000000000000000000000000000000000000..74e9321bf34d335608e0753358c62f97ea7559ac
--- /dev/null
+++ b/assets/txts/pg_0004.txt
@@ -0,0 +1,10 @@
+© 2024 KU Leuven – Faculty of Engineering Science
+Uitgegeven in eigen beheer, Jordy Van Landeghem, Celestijnenlaan 200A box 2402, B-3001 Leuven (Belgium)
+
+Alle rechten voorbehouden. Niets uit deze uitgave mag worden vermenigvuldigd en/of openbaar gemaakt worden door middel van druk, fotokopie, microfilm, elektronisch of op welke andere wijze ook zonder voorafgaande schriftelijke toestemming van de uitgever.
+All rights reserved. No part of the publication may be reproduced in any form by print, photoprint, microfilm, electronic or any other means without written permission from the publisher.
\ No newline at end of file
diff --git a/assets/txts/pg_0005.txt b/assets/txts/pg_0005.txt
new file mode 100644
index 0000000000000000000000000000000000000000..92cd5a79125a60f2b827b17c151c47cf1b15932d
--- /dev/null
+++ b/assets/txts/pg_0005.txt
@@ -0,0 +1,32 @@
+Preface
+This journey has been long and arduous, but I have finally reached an end. At this end, I have a thesis that I am proud of, and I have learned a lot. As I look back, I have been very fortunate to have had the support of many people, and I would like to take this opportunity to thank them.
+First and foremost, I would like to thank my supervisors, Sien and Matthew, for their guidance and support throughout this journey. Sien has taught me the importance of being thorough and meticulous, striving for diligence and perfection from the get-go. I still remember how patiently she helped me with my first paper, holding a Sunday afternoon call from her attic/home-office, helping me hone the presentation and writing. Involving Matthew as the co-supervisor has been the best decision for my personal development, as he offered a different perspective on my work, always challenging me to look at problems through the lens of statistical theory and machine learning fundamentals. My knee-jerk reaction to start implementing things as soon as possible was often met with a “slow down, think about it first” from Matthew, which has been invaluable in my development as a researcher. I am grateful to both of them for their patience and understanding, and for giving me the freedom to explore my own ideas and interests.
+Next, a sincere thanks to my jury members, for taking the time to read my thesis and for their valuable feedback. Furthermore, I would like to thank het Vlaams Agentschap Innoveren & Ondernemen (VLAIO) for awarding the Baekeland grant without which this PhD would not have been possible.
+Pol & Bertrand, thanks for having me contribute to your dream to rid the world of boring administrative processes and paperwork. Technically my bosses, but in reality you are the embodiment of leadership by example, and I am grateful for the many lessons I have learned from you. I am grateful for the many opportunities you have given me to grow as a researcher and as a person.
+Many thanks to my past and present colleagues at Contract.fit, for always
\ No newline at end of file
diff --git a/assets/txts/pg_0006.txt b/assets/txts/pg_0006.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d3d5790043833b5e7d169c207f84dfd98dc4de22
--- /dev/null
+++ b/assets/txts/pg_0006.txt
@@ -0,0 +1,45 @@
+preaching automation, inspiring me, and for having fun along the way. I am grateful to my LIIR colleagues at KU Leuven, particularly the folks from office 4.34 for the many interesting discussions and whiteboard sessions, whenever I occasionally popped into the office.
+I was fortunate to travel to many places during my PhD (Lausanne, Lisbon, Barcelona, San Jose, Paris, Waikoloa), and I have met many people along the way. My DUDEs, you have been the trigger to complete my PhD, reinvigorating my passion for research and inspiring me for my future career. How crazy is it that we conceived the seeds of the DUDE project in a pirate bar, on a hotel rooftop, and from a hospital bed after my back surgery?
+Finally, I would like to thank my family and friends for their support and encouragement throughout this journey. My parents, Peter en Nadine, you have shown me that hard work pays off, and merci for the many sacrifices you have made to give me the best possible education and life. Marijke, you are the love of my life, and although I am not religious, you are my goddess, de mammiej. Feliz, when you came into our lives, you added an extra dimension. I used to see in 2D, now I see in 3D. Forever your father, your pappiej. Wes en Jen, thanks for showing me to never give up, keep on pushing, even when you are at your lowest, there is a way out, and only hard work will get you there.
+Cornbois - Bryan, Emile, (even) Jan, for our friendship, I fail to make an exhaustive definition. I wish for many more years of friendship from my like-minded brothers. John, Teunen, Wannes, if there is ever a zombie apocalypse, I know that I can count on you to have my window. Kessel-city - Poohke, Vinny, Kweinch etc., thanks for keeping on pushing the bar higher, and inspiring me with your ambition and drive. Gustaf, thanks for the many laughs (#velleke) and the much-needed distraction. Elstipoes, you are my oldest friend, and I am grateful for the many years of friendship. Woutje, thanks for your contagious optimism and the mancave during university. Leuvenbende, you were the ones that made university fun and enjoyable. Individually and together you are beautiful people, and I cherish our yearly reunions. Lauren en Yannick, thanks for letting me win at Mario Kart. I might be forgetting some people, but I would like to thank all my friends for bringing joy, for keeping me grounded, and for reminding me that there is more to life than work.
+Having studied literature in my Bachelor’s, it feels appropriate to finish with a quote wrongly attributed to Ernest Hemingway: “Write drunk; edit sober.”
+Jordy Van Landeghem
+Gurdo, Pogomeister, Jorre, De Van Laaandeghem
+February, 2024
+Kessel, Belgium
\ No newline at end of file
diff --git a/assets/txts/pg_0007.txt b/assets/txts/pg_0007.txt
new file mode 100644
index 0000000000000000000000000000000000000000..270758d7b3a3540df394e67d4f430799543303da
--- /dev/null
+++ b/assets/txts/pg_0007.txt
@@ -0,0 +1,33 @@
+Abstract
+Human communication is increasingly document-based, requiring machines to understand a wide variety of visually-rich documents to assist humans in their daily lives. Amid the digital evolution, documents continue to facilitate crucial human and organizational interactions but are tethered to manual processing, causing inefficiency. We examine why organizations lag in adopting automated document processing solutions and outline two primary challenges: the complexity of processing long, multimodal documents algorithmically and the necessity for reliability and control over associated risks. Automated decision-making is key to improving the efficiency of document processing, but the current state-of-the-art technology is not yet reliable and robust enough to be deployed in autonomous systems.
+The practical objective set here is to develop Intelligent Automation (IA) systems capable of estimating confidence in their actions, thereby increasing throughput without accruing additional costs due to errors. We analyze the key challenges and propose solutions to bridge the gap between research and practical applications, with a focus on realistic datasets and experimental methodologies. Building upon foundations of Document Understanding (DU), this dissertation introduces advanced methodologies combining Machine Learning, Natural Language Processing, and Computer Vision.
+Addressing the evident gaps in research, this work presents novel methods for predictive uncertainty quantification (PUQ) alongside practical frameworks for evaluating the robustness and reliability of DU technologies. The contribution culminates in the introduction of two novel multipage document classification datasets and a multifaceted benchmark, DUDE, designed to rigorously challenge and assess the state-of-the-art in DU.
+Extensive experiments across these datasets reveal that while advancements have been made, significant room for improvement remains, particularly in long-context modeling for multipage document processing and calibrated, selective document visual question answering. Efficient DU is also explored, revealing the effectiveness of
\ No newline at end of file
diff --git a/assets/txts/pg_0008.txt b/assets/txts/pg_0008.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8fcb7c74fe32fe733c89c3782a15c5c00fabb6c3
--- /dev/null
+++ b/assets/txts/pg_0008.txt
@@ -0,0 +1,35 @@
+knowledge distillation (KD) model compression in visually-rich document layout analysis (DLA) and classification.
+Through empirical studies and methodological contributions, this dissertation has the following contributions and findings:
+First, in a benchmarking study of established PUQ methods on real-world text classification, we find that our novel hybrid method ‘Concrete Dropout Ensemble’ performs best, enhancing in-domain calibration and novel class detection, even at a smaller ensemble size. Detailed ablation experiments reveal the impact of prior, neural architecture, and hyperparameter choices on PUQ estimation quality.
+Second, on a prototypical DU task, we identify challenges in DU progress, propose a formalization of multipage document classification scenarios, construct novel datasets, and conduct an experimental analysis showing the promise of multipage representation learning and inference.
+Third, we introduce DUDE, incorporating multifaceted challenges and principles for a comprehensive evaluation of generic DU. In addition to our own benchmarking, we organize a competition, revealing that while newer document foundation models show promise, they struggle with questions involving visual evidence or complex reasoning. Moreover, we find severe problems in the ability of Large Language Models (LLMs) to reason about documents in their entirety, highlighting issues with hallucination, long-context reasoning, and control.
+Fourth, we propose the first methodology for enriching documents with semantic layout structure using distilled DLA models. We apply KD to visual document tasks, unraveling the influence of various task and architecture components.
+Finally, the dissertation concludes with a discussion of the findings and implications for future research, emphasizing the need for advancements in multipage document representation learning and the importance of realistic datasets and experimental methodologies to move measurably toward reliable and robust IA-DU technology.
\ No newline at end of file
diff --git a/assets/txts/pg_0009.txt b/assets/txts/pg_0009.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e510cdecf65e2ade2cee9d25ddae51979c3cad8f
--- /dev/null
+++ b/assets/txts/pg_0009.txt
@@ -0,0 +1,34 @@
+Beknopte samenvatting
+Menselijke communicatie is in toenemende mate documentgebaseerd, waarbij machines een breed aanbod aan visueel-rijke documenten moeten begrijpen om mensen in hun dagelijks leven te assisteren. Te midden van de digitale evolutie blijven documenten cruciale menselijke en organisatorische interacties faciliteren, maar zijn ze gebonden aan handmatige verwerking, wat inefficiëntie veroorzaakt.
+We onderzoeken waarom organisaties achterblijven bij het adopteren van geautomatiseerde documentverwerkingsoplossingen en schetsen twee primaire uitdagingen: de complexiteit van het algoritmisch verwerken van lange, multimodale documenten en de noodzaak van betrouwbaarheid en controle over daarmee samenhangende risico’s. Geautomatiseerde besluitvorming is essentieel voor het verbeteren van de efficiëntie van documentverwerking, maar de huidige stand van de technologie is nog niet betrouwbaar en robuust genoeg om ingezet te worden in autonome toepassingen.
+Het praktische doel dat gesteld wordt, is het ontwikkelen van systemen voor Intelligente Automatisering (IA) die in staat zijn om vertrouwen in hun acties te schatten, daarmee de doorvoer verhogend zonder extra kosten vanwege fouten. We analyseren de belangrijkste uitdagingen en stellen oplossingen voor om de kloof tussen onderzoek en praktische toepassingen te overbruggen, met een focus op realistische datasets en experimentele methodologieën. Voortbouwend op de fundamenten van Documentinterpretatie (DI), introduceert dit proefschrift geavanceerde methodologieën die Machinaal Leren, Natuurlijke Taalverwerking en Computer Visie combineren.
+Door de duidelijke hiaten in onderzoek aan te pakken, presenteert dit werk nieuwe methoden voor predictieve onzekerheidskwantificering (POK) naast praktische kaders voor het evalueren van de robuustheid en betrouwbaarheid van DI-technologieën. De bijdrage culmineert in de introductie van twee nieuwe datasets voor classificatie van multipagina documenten en een veelzijdige benchmark, DUDE, ontworpen om de state-of-the-art in DI rigoureus uit te dagen en te beoordelen. Uitgebreide experimenten met deze datasets
\ No newline at end of file
diff --git a/assets/txts/pg_0010.txt b/assets/txts/pg_0010.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4550822d7849f128d14da308982dbac3ba22a0b0
--- /dev/null
+++ b/assets/txts/pg_0010.txt
@@ -0,0 +1,44 @@
+onthullen dat er weliswaar vooruitgang is geboekt, maar dat er nog significant veel ruimte is voor verbetering, met name in de lange-contextmodellering voor de verwerking van multipagina documenten en gekalibreerde, selectieve visuele vraagbeantwoording van documenten. Meer schaalbaar DI wordt ook verkend, waarbij de effectiviteit van kennisdistillatie (KD) voor modelcompressie in visueel-rijke layoutanalyse (DLA) en classificatie van documenten aan het licht komt.
+Door middel van empirische studies en methodologische bijdragen heeft dit proefschrift de volgende bijdragen en bevindingen:
+Ten eerste vinden we in een benchmarkstudie van gevestigde POK-methoden op tekstclassificatie in de echte wereld dat onze nieuwe hybride POK-methode ’Concrete Dropout Ensemble’ het beste presteert, de kalibratie binnen het domein en de detectie van nieuwe klassen verbeterend, zelfs met een kleiner ensemble. Gedetailleerde ablatie-experimenten onthullen de impact van voorafgaande kennis, neurale architectuur en keuzes van hyperparameters op de kwaliteit van POK-schatting.
+Ten tweede identificeren we uitdagingen in de vooruitgang van DI en stellen een formalisatie voor van multipagina documentclassificatiescenario’s, bouwen nieuwe datasets, en voeren een experimentele analyse uit die de belofte van multipagina representatie-leren en inferentie toont.
+Ten derde introduceren we DUDE, waarin veelzijdige uitdagingen en principes worden voorgesteld voor een uitgebreide evaluatie.
+Naast onze eigen benchmarking organiseren we een competitie, waaruit blijkt dat hoewel nieuwere modellen veelbelovend zijn, ze het moeilijk hebben met vragen die visueel bewijs of complex redeneren vereisen. Bovendien vinden we ernstige problemen in het vermogen van Grote Taalmodellen (LLMs) om over documenten in hun geheel te redeneren, wat problemen benadrukt met hallucinatie, redeneren met lange context en controle.
+Ten vierde stellen we de eerste experimentele methodologie voor om documenten te verrijken met semantische layoutstructuur met behulp van gedestilleerde DLA-modellen. We passen KD toe op visuele documenttaken, waarbij we de invloed van verschillende taak- en architectuurcomponenten ontrafelen.
+Ten slotte sluit het proefschrift af met een bespreking van de bevindingen en implicaties voor toekomstig onderzoek, waarbij de noodzaak wordt benadrukt voor vooruitgang in multipagina documentrepresentatie-leren en het belang van realistische datasets en experimentele methodologieën om meetbaar vooruitgang te boeken naar betrouwbare en robuuste IA-DI technologie.
\ No newline at end of file
diff --git a/assets/txts/pg_0013.txt b/assets/txts/pg_0013.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2589af835d77d115f558fc3b7c2dafd6fd1abf18
--- /dev/null
+++ b/assets/txts/pg_0013.txt
@@ -0,0 +1,22 @@
+List of Abbreviations
+AAPD Arxiv Academic Paper Dataset
+Acc_ID Accuracy in-domain
+Acc_OOD Accuracy out-of-domain
+AI Artificial Intelligence
+ANLS Average Normalized Levenshtein Similarity
+AUPR Area Under the Precision-Recall Curve
+AURC Area-Under-Risk-Coverage-Curve
+AUROC Area Under the Receiver Operating Characteristic curve
+BDL Bayesian Deep Learning
+BNN Bayesian Neural Network
+BPM Business Process Management
+CE Cross-Entropy
+CER Character Error Rate
+COCO Common Objects in Context
+CSF Confidence Scoring Function
+CV Computer Vision
+DC Document Classification
+DG Document Generation
\ No newline at end of file
diff --git a/assets/txts/pg_0014.txt b/assets/txts/pg_0014.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b79bccd5dd81bba4301749b161306cdf80403810
--- /dev/null
+++ b/assets/txts/pg_0014.txt
@@ -0,0 +1,30 @@
+DL Deep Learning
+DLA Document Layout Analysis
+DNN Deep Neural Network
+DocAI Document AI
+DocVQA Document Visual Question Answering
+DOD Document Object Detection
+DU Document Understanding
+DUDE Document UnderstanDing of Everything
+ECE Expected Calibration Error
+ELBO Evidence Lower Bound
+ERM Empirical Risk Minimization
+FasterRCNN Faster Region-based Convolutional Neural Network
+FP False Positives
+IA Intelligent Automation
+ICDAR International Conference on Document Analysis and Recognition
+IDP Intelligent Document Processing
+i.i.d. Independent and Identically Distributed
+IOB/IOBES Inside, Outside, Beginning / End, Single
+KD Knowledge Distillation
+KIE Key Information Extraction
+LLM Large Language Model
+MAP Maximum-a-Posteriori
+mAP Mean Average Precision
+MCD Monte Carlo Dropout
\ No newline at end of file
diff --git a/assets/txts/pg_0015.txt b/assets/txts/pg_0015.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2c6f921d4e7c76d2c47d65bafbf05adc7e167ec3
--- /dev/null
+++ b/assets/txts/pg_0015.txt
@@ -0,0 +1,30 @@
+MCMC Markov Chain Monte-Carlo
+MDLT Multi-Domain Long-Tailed Recognition
+MECE Mutually Exclusive and Collectively Exhaustive
+MI Mutual Information
+ML Machine Learning
+MSE Mean Squared Error
+MSP Maximum Softmax Probability
+MU Model Uncertainty
+NLG Natural Language Generation
+NLL Negative Log Likelihood
+NLP Natural Language Processing
+NN Neural Network
+OCR Optical Character Recognition
+OOD Out-of-Distribution
+PCC Pearson Correlation Coefficient
+PUQ Predictive Uncertainty Quantification
+RERM Regularized Empirical Risk Minimization
+ResNet Residual Network
+RPA Robotic Process Automation
+SaaS Software-as-a-service
+SNGP Spectral-normalized Neural Gaussian Process
+SOTA State-of-the-art
+STP Straight-Through-Processing
+TSR Table Structure Recognition
\ No newline at end of file
diff --git a/assets/txts/pg_0016.txt b/assets/txts/pg_0016.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ed6aa887a225e20ba41d199e062db8224ea88510
--- /dev/null
+++ b/assets/txts/pg_0016.txt
@@ -0,0 +1,12 @@
+VDU Visual Document Understanding
+VI Variational Inference
+VLM Vision Language Model
+VQA Visual Question Answering
+VRD Visually-Rich Document
+WER Word Error Rate
\ No newline at end of file
diff --git a/assets/txts/pg_0017.txt b/assets/txts/pg_0017.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdaa9c3763438226f6545313ed513efd4e5b5fb9
--- /dev/null
+++ b/assets/txts/pg_0017.txt
@@ -0,0 +1,200 @@
+Contents
+
+Abstract  iii
+Beknopte samenvatting  v
+List of Abbreviations  xii
+Contents  xiii
+List of Figures  xix
+List of Tables  xxv
+
+1 Introduction  1
+  1.1 Research Context  4
+  1.2 Problem Statement and Questions  6
+    1.2.1 Reliable and Robust Deep Learning  6
+    1.2.2 Realistic and Efficient Document Understanding  7
+  1.3 Outline  9
+
+2 Fundamentals  11
+  2.1 Statistical Learning  12
+    2.1.1 Neural Networks  14
+    2.1.2 Probabilistic Evaluation  15
+    2.1.3 Architectures  16
+      2.1.3.1 Convolutional Neural Networks  17
+      2.1.3.2 Language Neural Networks  18
+      2.1.3.3 Transformer Network  19
+  2.2 Reliability and Robustness  21
+    2.2.1 Generalization and Adaptation  22
+    2.2.2 Confidence Estimation  23
+    2.2.3 Evaluation Metrics  24
\ No newline at end of file
diff --git a/assets/txts/pg_0018.txt b/assets/txts/pg_0018.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1b5587f64c55836f280f605559f2fbc81e8fff8d
--- /dev/null
+++ b/assets/txts/pg_0018.txt
@@ -0,0 +1,204 @@
+    2.2.4 Calibration  28
+    2.2.5 Predictive Uncertainty Quantification  30
+    2.2.6 Failure Prediction  32
+  2.3 Document Understanding  33
+    2.3.1 Task Definitions  35
+    2.3.2 Datasets  36
+    2.3.3 Models  37
+    2.3.4 Challenges in Document Understanding  38
+      2.3.4.1 Long-Context Modeling  39
+      2.3.4.2 Document Structure Modeling  40
+  2.4 Intelligent Automation  41
+
+I Reliable and Robust Deep Learning  43
+
+3 Benchmarking Scalable Predictive Uncertainty in Text Classification  44
+  3.1 Introduction  46
+  3.2 Related Work  48
+  3.3 Uncertainty Methods  51
+    3.3.1 Quantifying Uncertainty in Deep Learning  51
+    3.3.2 Predictive Uncertainty Methods  52
+      3.3.2.1 Monte Carlo Dropout  53
+      3.3.2.2 Deep Ensemble  53
+      3.3.2.3 Concrete Dropout  54
+      3.3.2.4 Heteroscedastic Extensions  54
+    3.3.3 Uncertainty Estimation  55
+    3.3.4 Motivating Hybrid Approaches  58
+    3.3.5 Uncertainty Calibration under Distribution Shift  59
+  3.4 Experimental Methodology  61
+    3.4.1 Proposed Hybrid Approaches  61
+    3.4.2 Datasets  63
+    3.4.3 Architecture  64
+    3.4.4 Evaluation metrics  66
+    3.4.5 Experimental design  66
+      3.4.5.1 In-domain Setting  67
+      3.4.5.2 Cross-domain Setting  67
+      3.4.5.3 Novelty Detection Setting  68
+  3.5 Results  69
+    3.5.1 Experiment: In-domain  70
+    3.5.2 Experiment: Cross-domain  71
+    3.5.3 Experiment: Novelty Detection  73
+    3.5.4 Experiment: Ablations  75
+      3.5.4.1 Diversity  76
\ No newline at end of file
diff --git a/assets/txts/pg_0019.txt b/assets/txts/pg_0019.txt
new file mode 100644
index 0000000000000000000000000000000000000000..724ad053415c4d63da70b2d88aa4cd7f2cc9877e
--- /dev/null
+++ b/assets/txts/pg_0019.txt
@@ -0,0 +1,394 @@
+      3.5.4.2 NLP Architecture  77
+      3.5.4.3 Ensemble size M  79
+      3.5.4.4 Concrete Dropout p  80
+  3.6 Discussion  81
+  3.7 Additional Uncertainty Approaches  85
+    3.7.1 Stochastic Gradient MCMC Methods  86
+    3.7.2 Spectral-normalized Neural Gaussian Process  87
+      3.7.2.1 SNGP Results  88
+      3.7.2.2 SNGP Discussion  90
+  3.8 Limitations  90
+  3.9 Chapter Conclusion  91
+
+II Realistic and Efficient Document Understanding  94
+
+4 Beyond Document Page Classification: Design, Datasets, and Challenges  95
+  4.1 Introduction  97
+  4.2 Problem Formulation  98
+  4.3 Balancing Research & Applications  101
+  4.4 Experimental Study  104
+  4.5 Challenges and Guidelines  107
+    4.5.1 Divergence of Tasks: f  107
+    4.5.2 Divergence of Label Space: Y  108
+    4.5.3 Divergence of Input Data: X  109
+    4.5.4 Maturity of Evaluation Methodology  111
+  4.6 Chapter Conclusion  111
+
+5 Document UnderstanDing of Everything (DUDE)  113
+  5.1 Introduction  116
+  5.2 Related Work  117
+  5.3 DUDE Dataset  118
+    5.3.1 Gathering Documents  121
+    5.3.2 Annotation Process  121
+    5.3.3 Dataset Statistics  123
+    5.3.4 Diagnostic Subsets  125
+    5.3.5 Evaluation  126
+  5.4 DUDE Competition  128
+    5.4.1 Challenge Objectives  128
+    5.4.2 Challenge Contributions  129
+    5.4.3 Motivation and Scope  129
+      5.4.3.1 Desired Generalization  130
\ No newline at end of file
diff --git a/assets/txts/pg_0020.txt b/assets/txts/pg_0020.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f2b9d2d9eaff250d78410357af3a9b4945d5bc6a
--- /dev/null
+++ b/assets/txts/pg_0020.txt
@@ -0,0 +1,320 @@
+    5.4.4 DUDE Competition Protocol  131
+      5.4.4.1 Task Formulation  132
+      5.4.4.2 Evaluation Protocol  132
+  5.5 DUDE Benchmark  133
+    5.5.1 Baselines  133
+    5.5.2 Analysis & Discussion  134
+  5.6 Detailed Results Analysis  136
+    5.6.1 Within Model Class Analysis  136
+      5.6.1.1 Encoder vs. Decoder  136
+      5.6.1.2 Incorporating Layout & Vision  136
+      5.6.1.3 Toward Long Document Processing  136
+      5.6.1.4 Diagnosis of LLM Results  137
+    5.6.2 Assessing Confidence  138
+  5.7 DUDE Competition Results  138
+    5.7.1 Submitted Methods  138
+    5.7.2 Performance Analysis  139
+  5.8 Chapter Conclusion  144
+
+6 DistilDoc: Knowledge Distillation for Visually-Rich Document Applications  145
+  6.1 Introduction  147
+  6.2 Related Work  149
+  6.3 Experimental Setup  151
+    6.3.1 Datasets  152
+    6.3.2 Architectures and Backbones  153
+    6.3.3 KD Methods  155
+    6.3.4 Evaluation  157
+    6.3.5 DLA-enriched LLM prompting  158
+  6.4 Results & Discussion  158
+  6.5 Chapter Conclusion  163
+
+7 Conclusion  165
+  7.1 Summary  165
+  7.2 Perspectives For Future Research  171
+    7.2.1 Open Problems In Reliability & Robustness  172
+    7.2.2 A Future-Proof Design Of IA-DU  173
+      7.2.2.1 The ‘Ultimate’ DU Dataset?  173
+      7.2.2.2 A Feature-complete IA-DU Solution?  178
+
+Bibliography  181
+
+A Appendix - PUQ  223
+  A Implementation Details  223
\ No newline at end of file
diff --git a/assets/txts/pg_0021.txt b/assets/txts/pg_0021.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9e8d5f09dc4d595e82b75ae821605890ac7ea717
--- /dev/null
+++ b/assets/txts/pg_0021.txt
@@ -0,0 +1,421 @@
+    A.1 Software and Data  223
+    A.2 Hyperparameter Defaults  223
+  B Practical Considerations  224
+    B.1 Take-home Summary  224
+    B.2 Compute vs. Performance Trade-off  225
+  C Detailed Experiment Results  226
+    C.1 Zoom-in Benchmark Evidence  226
+    C.2 Absolute Benchmark Results  226
+
+B Appendix - BDPC  230
+  A Existing DC Datasets  230
230 +B +Visualization of Proposed DC Datasets . . . . . . . . . . . . . . 231 +C Appendix - DUDE +A +Baseline Experiments Setup . . . . . . . . . . +A.1 +Hyperparameter Defaults . . . . . . . +A.2 +Generative LLM Prompt Fine-tuning +A.3 +Confidence Estimation . . . . . . . . . +A.4 +Evaluation . . . . . . . . . . . . . . . +B +Qualitative Examples . . . . . . . . . . . . . +B.1 +Qualitative Examples - Competition . + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +232 +232 +232 +232 +233 +235 +235 +241 + +D Appendix - KDD +A +Code and Datasets . . . . . . . . . . . +B +Implementation Details . . . . . . . . +C +Task Definitions . . . . . . . . . . . . +D +Additional Experiment Results . . . . +D.1 +Tobacco-3482 Results . . . . . +D.2 +PRImA Results . . . . . . . . . +D.3 +RVL-CDIP-N Results . . . . . +D.4 +Downstream DocVQA Results +D.5 +Ablation Experiments . . . . . + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +244 +244 +244 +246 +247 +249 +249 +249 +249 +249 + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +Curriculum + +253 + +Publications + +255 + + \ No newline at end of file diff --git a/assets/txts/pg_0033.txt b/assets/txts/pg_0033.txt new file mode 100644 index 0000000000000000000000000000000000000000..57094c296cbbaf9ba200a76cba188d7cd8ef485d --- /dev/null +++ b/assets/txts/pg_0033.txt @@ -0,0 +1,30 @@ +Chapter 1 + +Introduction +“yourAmid +significant life events—like buying a house or expecting +firstborn child—lies a less cheerful reality that I experienced +firsthand: the hassle of dealing with manual paperwork. + +For the former case, this required a lot of back-and-forth with +the bank, the notary, and the real estate agent, with each of +them requiring a different set of documents (e.g., monthly pay +stubs, bank statements, copies of national registry, etc.) to be +filled in, signed, and sent back for processing. +On the side of the document processors, each document needed +to be classified, key information extracted, and the information +validated against other documents to be able to prove my +solvency in making an offer, applying for a loan, or being drafted +as the future house owner. In between all parties and external +organizations, even more documents were either created, adapted, +or passed along such as the offer, the loan agreement, the deed +of sale, a soil certificate, etc. +This juxtaposition of valuable moments in life with cumbersome +administrative procedures involving manual document +processing forms the backdrop against which I aim to explore +and propose potential solutions in this thesis. + +” +1 + + \ No newline at end of file diff --git a/assets/txts/pg_0034.txt b/assets/txts/pg_0034.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9c2d135877ee5ba3d879253a8818af58f115f27 --- /dev/null +++ b/assets/txts/pg_0034.txt @@ -0,0 +1,44 @@ +2 + +INTRODUCTION + +Documents are containers of information that are easily shareable. 
The concept of a document dates back to when humans started writing and has been a cornerstone of human communication ever since. In the age of digital technology, documents are still the primary means of communication between humans and organizations and form the backbone of many business processes. Human communication is increasingly happening through digital channels, and the COVID-19 pandemic has only accelerated this trend. We are increasingly living in a “document society” [53], dependent on documents in our daily lives or for recording second-hand knowledge. With instant gratification as the norm in the digital age, people expect similar seamless interactions with businesses and governments. While digitization has increased the speed and ease of document-based communication, document processing remains a largely human effort, with organizations drowning under the sheer volume of documents they receive.

So why have organizations not switched en masse to automated document processing?

The answer lies partly in (I) the complexity of the task, and partly in (II) the need for reliability and risk control.

(I) While it might be straightforward for a human (white-collar) worker to read a long, structured document, understand its contents, categorize it, and extract crucial information accordingly, this is not so easy for a machine. This could be perceived as an instance of Moravec’s paradox [319], which states that tasks that are easy for humans are hard for machines, and vice versa. However, in recent times, significant strides forward have been made thanks to technological advances combining Natural Language Processing (NLP), Computer Vision (CV) and Machine Learning (ML). Document Understanding (DU) is the umbrella term for both the end-to-end solution and the research field studying how to make machines interpret and understand documents (elaborated on in Section 2.3). It has seen a surge in interest in the past few years, with the rise of large-scale pretrained Language and Vision models (LLM, VLM) [52, 94, 101, 187, 380, 383, 502] capable of modeling document inputs.

What makes DU challenging is that it encompasses multiple subtasks, each of which is a research field in its own right, such as Optical Character Recognition (OCR), Document Layout Analysis (DLA), Document Classification (DC), Key Information Extraction (KIE), Visual Question Answering (VQA), etc. The complexity of the task is further increased by the fact that documents are multimodal, containing both text and images, and that they are compositional, i.e., the meaning of the document is not just the sum of its parts. Information can appear in a wide range of forms including text, images, tables or graphs, and be spread across multiple pages.

 \ No newline at end of file
diff --git a/assets/txts/pg_0035.txt b/assets/txts/pg_0035.txt
new file mode 100644
index 0000000000000000000000000000000000000000..236c98703711a3f2beeba1724b9a4950bd424684
--- /dev/null
+++ b/assets/txts/pg_0035.txt
@@ -0,0 +1,46 @@

Moreover, the meaning of a document can change depending on the context in which it is used. As an artifact of the communication channel, not all documents are born digitally, and the quality of the document can vary greatly, with some documents being handwritten, scanned with low resolution, or even a picture of a document.
Furthermore, documents are often not standardized templates and can be highly variable in terms of layout, structure, and content. Finally, the longer the document, the more computationally demanding it becomes to process, and the more likely it is to induce errors, which can be harder to detect.

Addressing the inherent challenges of document processing, and achieving high levels of accuracy, processing speed, reliability, robustness, and scalability in DU, forms the applied scope of this thesis.

(II) Consider the earlier example of the birth certificate. While I might not much appreciate the manual handling of this document, if they had registered my baby girl’s name (Feliz, Spanish spelling, without an accent on the ‘e’) incorrectly, I would be pretty upset, as this could have further repercussions. Whereas this error might be easily rectified, it is not so easy to do so in the case of a mortgage application, where the wrong information could lead to a rejection of the application, or even worse, a loan agreement with the wrong terms and conditions. This demonstrates that, even when full automation of document processing is in high demand, it is not always desirable if the risk of failure is too large.

Nevertheless, a lot of the potential for automation remains untapped, and organizations are increasingly looking for solutions to fully automate their document processing workflows. However, full automation, implying perfect recognition of document categories and impeccable information extraction, is an unattainable goal with the current state of technology [79]. The more realistic objective is Intelligent Automation (IA) (elaborated on in Section 2.4), where the goal is to have the machine estimate confidence in its predictions, deriving business value from the highest possible volume of perfect predictions (Straight-Through-Processing, STP) without incurring extra costs (False Positives, FP).

The leitmotif of this thesis will be the fundamental enablers of IA: confidence estimation and failure prediction. Calibrated uncertainty estimation with efficient and effective DU technology will allow organizations to confidently automate their document processing workflow, while keeping a human in the loop only for predictions with a higher likelihood of being wrong. To date, however, little research has addressed the question of how to make DU technology more reliable, as is illustrated in a toy analysis (Table 1.1) reporting the absence of many IA-related keywords in the Proceedings of the 2021 International Conference on Document Analysis and Recognition (ICDAR) [289].

 \ No newline at end of file
diff --git a/assets/txts/pg_0036.txt b/assets/txts/pg_0036.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19747cca63d44df370b862437a8c601a3f946800
--- /dev/null
+++ b/assets/txts/pg_0036.txt
@@ -0,0 +1,80 @@

The thesis aims to fill this gap by proposing novel methods for uncertainty estimation and failure prediction (Part I), and by providing a framework for benchmarking and evaluating the reliability and robustness of DU technology, as close as possible to real-world requirements (Part II).

Table 1.1. Comparative analysis of keywords in the ICDAR 2021 proceedings. While many DU subtasks are represented, there is a lack of keywords related to IA. Do note that calibration is used in the context of camera calibration, and not in the context of confidence estimation.
keyword                         freq
document                        3388
classification                   242
key information                   56
question answering               106
layout analysis                  223
calibration/calibrate             33
temperature scaling                0
failure prediction                 0
misclassification detection        0
out-of-distribution/OOD           25
predictive uncertainty             0

In the remainder of the Introduction, I will sketch the surrounding research context, followed by the problem statement and research questions, and finally the outline of the thesis manuscript.

1.1 Research Context

All chapters of this dissertation have been executed as part of the Baekeland PhD mandate (HBC.2019.2604) with financial support of VLAIO (Flemish Innovation & Entrepreneurship) and Contract.fit. The latter is a Belgian-based software-as-a-service (SaaS) provider of Intelligent Document Processing (IDP) drawing on innovations in DU to power their product suite (email-routing, Parble), and my generous employer since 2017.

Some of the joint work (Chapter 5) has been partially funded by a PhD Scholarship from AGAUR (2023 FI-3-00223), and the Smart Growth Operational Programme under projects no. POIR.01.01.01-00-1624/20 (Hiper-OCR - an innovative solution for information extraction from scanned documents) and POIR.01.01.01-00-0605/19 (Disruptive adoption of Neural Language Modelling for automation of text-intensive work).

Moreover, given that the dissertation work has been performed over a large span of time, it warrants putting it in the larger context and dynamics of AI innovations, the state of DU as a field, how notions of ‘reliability’ have evolved over time, and finally the business context.

 \ No newline at end of file
diff --git a/assets/txts/pg_0037.txt b/assets/txts/pg_0037.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7e98d1b8513fc389c48dd03f28fdfd521ee8dafd
--- /dev/null
+++ b/assets/txts/pg_0037.txt
@@ -0,0 +1,42 @@

This thesis started almost concurrently with the rise of the global COVID-19 pandemic, making it hard to foster collaborations in the early stages. At the start of the PhD, DU methodology was fairly established, with OCR and Transformer-based pipelines such as BERT [94] and LayoutLM [502], which is why we first prioritized the more fundamental challenge of decision-making under uncertainty (Part I), followed by a step back, closer to applied DU research (Part II).

The research community’s understanding of ‘reliability’ has also evolved over time. When starting the work of Chapter 3, the notion of reliability was mostly associated with uncertainty quantification and calibration. However, calibration is not a panacea, and only fairly recently, Jaeger et al. [193] proposed a more general framework encapsulating reliability and robustness. They promote the more concrete and useful notion of failure prediction, which still involves confidence/uncertainty estimation, yet with an explicit definition of the failure source that one wants to detect or guard against, e.g., in-domain test errors, changing input feature distributions, novel class shifts, etc. Since I share a similar view of the problem, I have focused subsequent works on the more general notion of failure prediction, which is also more in line with the business context of IA.
Whereas we originally intended to work on multi-task learning of DU subtasks, the rise of general-purpose LLMs offering a natural language interface to documents rather than discriminative modeling (e.g., ChatGPT [52, 344]) prompted us toward evaluating this promising technology in the context of DU. More importantly, we observed the lack of sufficiently complex datasets and benchmarks in DU that would allow us to tackle larger, more fundamental questions such as ‘Do text-only LLMs suffice for most low-level DU subtasks?’ (subsequently tackled in Chapter 5), which is why we shifted our focus to the more applied research questions of benchmarking and evaluation (Part II).

Finally, the business context has also evolved over time. Originally, IDP was practiced by legacy OCR companies; specialized vendors, offering a range of solutions for specific document types (e.g., invoices, contracts, tax forms, etc.); or cloud service providers, offering IDP as part of a larger suite of services (e.g., AWS Textract, Azure Form Recognizer, etc.). However, the rise of both open-source LLM development and powerful, though closed-source models has lowered the barrier to entry for any new entrants or incumbents. This has led to a commoditization of IDP, with the quality of the LLMs and the ease of integration with existing business processes becoming key differentiators.

 \ No newline at end of file
diff --git a/assets/txts/pg_0038.txt b/assets/txts/pg_0038.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6f8b0de8c96bc1502b9dbd808025374cfe0728da
--- /dev/null
+++ b/assets/txts/pg_0038.txt
@@ -0,0 +1,45 @@

1.2 Problem Statement and Questions

The general introduction sketches the context of the research and motivates the research questions. In this Section, I will formulate the problem statement and research questions more formally, and explain how they relate to the manuscript’s contents.

1.2.1 Reliable and Robust Deep Learning

The dissertation opens with the more fundamental challenge of targeting reliability and robustness in Deep Learning, which covers fairly abstract concepts that have been used interchangeably and inconsistently in the literature. They will be defined more extensively in Section 2.2, but for now, consider reliability as the ability to avoid failure, robustness as the ability to resist failure, and resilience as the ability to recover from failure [373, 438, 455]. In Chapter 3, we focus on the more concrete objective of predictive uncertainty quantification (PUQ), which shows promise for improving reliability and robustness in Deep Learning (DL) [123, 140, 173, 455]. Concretely, PUQ methods are expected to elucidate sources of uncertainty, such as a model’s lack of in-domain knowledge due to either training data scarcity or model misspecification, or its ability to flag potentially noisy, shifted or unknown input data [136].

We observed that the majority of prior PUQ research focused on regression and CV tasks, while the applicability of PUQ methods had not been thoroughly explored in the context of NLP. As mentioned earlier, most DU pipelines (in 2020) were text-centric with a high dependency on the quality of OCR. Since OCR is often considered a solved problem [262], we hypothesized that the main source of error and uncertainty in DU would reside in the text representations learned by deep neural networks (DNNs).
This is why we focused on the more fundamental question of how well PUQ methods scale in NLP. More specifically, we restricted the scope to the prototypical, well-studied task of text classification, for which we could leverage existing multi-domain datasets varying in complexity, size and label space (multi-class vs. multi-label).

This leads to the following research questions:

RQ 1. When tested in realistic language data distributions on various text classification tasks, how well do PUQ methods fare in NLP?

 \ No newline at end of file
diff --git a/assets/txts/pg_0039.txt b/assets/txts/pg_0039.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6e982e5ac139ad4d5fcbace54f49f5db0d7753d8
--- /dev/null
+++ b/assets/txts/pg_0039.txt
@@ -0,0 +1,41 @@

RQ 2. In which settings are PUQ methods most useful, i.e., which failure sources / distribution shifts are they most sensitive to?

RQ 3. How can we obtain better PUQ estimates without overrelying on computationally prohibitive methods, e.g., Deep Ensemble [238]?

RQ 4. How important are certain prior, neural architecture or hyperparameter influences on the quality of PUQ estimation?

In a later chapter (Chapter 5), we introduce a complex benchmark for generic DU that additionally tests for robustness to domain, visual and layout shifts, and explores the novel problem of hallucination and control in natural language generation (NLG) with LLMs from the perspective of calibrated and selective DocVQA. The general task formulation involves a natural language question (on content, aspect, form, visual/layout), an input document, and a set of reference answers. The model is expected to provide a natural language answer, an answer confidence and a (binary) abstention decision. Evaluation is done in terms of answer correctness, calibration and selective prediction. On the one hand, one expects a model to lower confidence when unsure about the correctness of a predicted answer. On the other hand, one expects a model to abstain from answering and refrain from hallucinations on unanswerable questions (which had been explicitly added in the dataset).

RQ 5. How severe is the problem of hallucination and control in LLMs when evaluated in a selective, free-form DocVQA task setting?

1.2.2 Realistic and Efficient Document Understanding

The second part of the dissertation focuses on the more applied research questions of realistic and efficient DU. The overall objective is to make DU technology more generically applicable (Chapter 5), evaluation more in sync with real-world requirements (Chapters 4 and 5), and more efficient at modeling the multimodal and compositional nature of documents (Chapters 5 and 6).

Due to the proximity to business applications and the risks of leaking personal information, DU research benchmarks have diverged substantially from the real-world distributions of document data. For instance, DU datasets are often limited to single-page document images, are from outdated sources (e.g., IIT-CDIP [252]), or are restricted to a single domain or a small set of document types.

 \ No newline at end of file
diff --git a/assets/txts/pg_0040.txt b/assets/txts/pg_0040.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fc4a62691bd1c7887f28bbb0c32878d6a1334e0b
--- /dev/null
+++ b/assets/txts/pg_0040.txt
@@ -0,0 +1,35 @@
+We posit that larger, fundamental questions in DU remain unanswered due to a +lack of sufficiently complex datasets and benchmarks with a rich methodology +covering evaluation beyond the independent and identically distributed (i.i.d.) +test set setting. While there exist performant models for DU subtasks such +as OCR, DC, KIE, etc., it is unclear how to move from these specific analysis +and recognition tasks to models that can reason and understand documents. A +truly end-to-end DU solution must handle the complexity and variety of realworld documents and subtasks, which could be expressed as natural language +questions. Moreover, it should be able to generalize to any question on any +document and reason over multiple pages and modalities. +The following research questions are addressed in Chapters 4 and 5: +RQ 6. How can we iteratively close the gap between research and practice in DU? +RQ 7. How can we design a resource that comprehensively challenges the state-ofthe-art? +RQ 8. Which DU aspects are most challenging for current state-of-the-art LLMs? +How can these be incorporated in a benchmark to allow proper measurements +of future improvements? +However, moving the goalpost beyond a single-page context inevitably requires +us to reconsider the research challenge of efficiency in DU. The rise of LLMs +has enabled a new generation of DU pipelines, which are more flexible and +easier to maintain than separate and specialized subtask modules, but also +more computationally demanding. Importantly, most LLMs are not designed +to handle the multimodality and long context windows of multipage documents, +and are often unaware of the visual and layout semantics of documents. +The research questions for Chapter 6 address the efficiency challenge in DU: +RQ 9. How can we efficiently infuse LLMs with semantic layout awareness for +more focused information extraction? +RQ 10. To what degree can model compression resolve the problem of efficiency +in processing documents? + + \ No newline at end of file diff --git a/assets/txts/pg_0041.txt b/assets/txts/pg_0041.txt new file mode 100644 index 0000000000000000000000000000000000000000..8931c41526dc7adef34baac4ef2402763feeb09d --- /dev/null +++ b/assets/txts/pg_0041.txt @@ -0,0 +1,27 @@ +OUTLINE + +1.3 + +9 + +Outline + +Figure 1.1. Overview of publications and how they relate to the chapters. + +Figure 1.2. Visual Overview of the research questions and how they relate to the +chapters. + +After the introductory Chapters 1 and 2, we continue with the publication-based +chapters that form the core of the thesis, which are structured in two parts. +Part I consists of a single chapter, Chapter 3, which presents a benchmarking +study of PUQ methods applied on real-world text classification datasets with +1-D convolutional neural networks and pretrained transformers. It motivates +a novel PUQ method, Deep Ensemble with Concrete Dropout, combining the +benefits of both methods, and showing promise for improving reliability and +robustness in NLP at a lower computational cost. The chapter concludes with +a discussion of the results, including targeted ablation studies, and provides +recommendations for future research. +Part II consists of three chapters, Chapters 4 to 6, which all focus on the more +applied research questions of realistic and efficient DU. 
 \ No newline at end of file
diff --git a/assets/txts/pg_0042.txt b/assets/txts/pg_0042.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7e2d484b3d991675fe433215940bb7e88c25708
--- /dev/null
+++ b/assets/txts/pg_0042.txt
@@ -0,0 +1,31 @@

Chapter 4 reflects on the current state of DU research, and proposes guidelines to foster document dataset construction efforts. It introduces two novel document classification datasets, RVL-CDIP_MP and RVL-CDIP-N_MP, as extensions of the RVL-CDIP dataset [165] with multipage documents. The datasets are accompanied by a comprehensive experimental analysis, which shows promise for advancing multipage document representations and inference.

Chapter 5 introduces the multi-faceted DUDE benchmark for assessing generic DU, which was also hosted as a competition to challenge the DU community. It describes the complete methodology and design of the dataset, targeting model innovations that can handle the complexity and variety of real-world documents and subtasks, and generalize to any documents and any questions. Next to a discussion of the competition results, it also presents our own comprehensive benchmarking study of SOTA LLMs, varying the context length and the modalities represented.

Chapter 6 investigates how to efficiently obtain more semantic document layout awareness. We explore what affects the teacher-student knowledge gap in KD-based model compression methods, and design a downstream task setup to evaluate the robustness of distilled DLA models on zero-shot layout-aware DocVQA.

Finally, Chapter 7 concludes the thesis with a summary of the main contributions (Section 7.1), and a discussion of future research directions. As a logical follow-up to Chapter 5, we propose in Section 7.2.2.1 how the DUDE dataset could be extended to become the ‘ultimate’ DU benchmark. The thesis ends with a hypothetical, informed design of how the research presented would form part of an end-to-end, fully-fledged IA-DU solution (Section 7.2.2.2).

 \ No newline at end of file
diff --git a/assets/txts/pg_0043.txt b/assets/txts/pg_0043.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36494ff567857e69aa11d563bca196f78d67b2c6
--- /dev/null
+++ b/assets/txts/pg_0043.txt
@@ -0,0 +1,32 @@

Chapter 2

Fundamentals

This chapter provides all the background knowledge necessary to understand the contributions of this thesis. The key questions covered here are:

i. How to feed a document to an algorithm to perform arbitrary tasks on it?
ii. How to model language, vision, layout or structure?
iii. How does it learn and then operate at inference time?
iv. How does it estimate prediction uncertainty?
v. How to evaluate its performance?
vi. How to integrate it as a useful, end-to-end system in a document workflow?

Section 2.1 explains the basic setting from the perspective of statistical learning theory [472], which is a mathematical framework for analyzing how algorithms learn from data with minimal error. Section 2.2 gives a primer on reliability and robustness, particularly calibration, failure detection and relevant evaluation metrics. Section 2.3 surveys the DU field, and discusses the state of the art in DU technology. Finally, Section 2.4 covers Intelligent Automation to illustrate how solving the challenges posed in this thesis will enable augmenting human intelligence, creativity and productivity in straight-through business processes.
+ +11 + + \ No newline at end of file diff --git a/assets/txts/pg_0044.txt b/assets/txts/pg_0044.txt new file mode 100644 index 0000000000000000000000000000000000000000..704dc7be0a638d940005f74cebb080df6d5fba1f --- /dev/null +++ b/assets/txts/pg_0044.txt @@ -0,0 +1,163 @@ +12 + +FUNDAMENTALS + +Contents +2.1 + +2.2 + +2.3 + +2.4 + +2.1 + +Statistical Learning - basics . . . . . . . . . . . . +2.1.1 Neural Networks . . . . . . . . . . . . . +2.1.2 Probabilistic Evaluation . . . . . . . . . +2.1.3 Architectures . . . . . . . . . . . . . . . +Reliability and Robustness . . . . . . . . . . . . +2.2.1 Generalization and Adaptation . . . . . +2.2.2 Confidence Estimation . . . . . . . . . . +2.2.3 Evaluation Metrics . . . . . . . . . . . . +2.2.4 Calibration . . . . . . . . . . . . . . . . +2.2.5 Predictive Uncertainty Quantification . . +2.2.6 Failure Prediction . . . . . . . . . . . . . +Document Understanding . . . . . . . . . . . . . +2.3.1 Task Definitions . . . . . . . . . . . . . . +2.3.2 Datasets . . . . . . . . . . . . . . . . . . +2.3.3 Models . . . . . . . . . . . . . . . . . . . +2.3.4 Challenges in Document Understanding +Intelligent Automation . . . . . . . . . . . . . . + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +12 +14 +15 +17 +18 +19 +20 +21 +25 +27 +29 +30 +31 +33 +34 +35 +38 + +Statistical Learning + +Two popular definitions of Machine Learning (ML) are given below. +Machine Learning is the field of study that gives computers the ability +to learn without being explicitly programmed. [406] +A computer program is said to learn from experience E with respect to +some class of tasks T, and performance measure P, if its performance +at tasks in T, as measured by P, improves with experience E. [317] +Following these, different types of learning problems [472] can be discerned, of +which the most common (and the one used throughout our works) is supervised +learning. It defines experience E as a set of input-output pairs for which the +task T is to learn a mapping f from inputs X ∈ X to outputs Y ∈ Y, and the +performance measure P is the risk or expected loss (Equation (2.1)), given a +(0-1) loss function ` : Y × Y → R+ . +R(f ) = E(X,Y )∼P [`(Y, f (X))] + +(2.1) + +The mapping f (·; θ) : X → Y is typically parameterized by a set of parameters +θ (omitted whenever it is fixed) and a hypothesis class F, which is a set of + + \ No newline at end of file diff --git a/assets/txts/pg_0045.txt b/assets/txts/pg_0045.txt new file mode 100644 index 0000000000000000000000000000000000000000..3afb663ca09e5aa63b40814121f6a0773a88264d --- /dev/null +++ b/assets/txts/pg_0045.txt @@ -0,0 +1,53 @@ +STATISTICAL LEARNING + +13 + +possible functions. The objective is to find a function f ∈ F that minimizes the +risk, or even better, the Bayes risk +f ∗ = inf R(f ), +f ∈F + +(2.2) + +which is the minimum achievable risk over all functions in F. The latter is only +realizable with infinite data or having access to the data-generating distribution +P(X , Y). In practice, Equation (2.2) is unknown, and the goal is to find a +function fˆ that minimizes the empirical risk +N +1 X +`(yi , f (xi )), +fˆ = +N i=1 + +(2.3) + +where (xi , yi ) are N independently and identically distributed (i.i.d.) samples +drawn from an unknown distribution P on X × Y. 
This is known as empirical risk minimization (ERM), which is a popular approach to supervised learning, under which three important processes are defined.

Training or model fitting is the process of estimating the parameters θ of a model, which is done by minimizing a suitable loss function ℓ over a training set D = {(x_i, y_i)}_{i=1}^{N} of N i.i.d. samples.

Inference or prediction is the process of estimating the output of a model for a given input, which is typically done by computing the posterior probability P(y|x) over the output space Y. Classification output is a discrete label, while regression output is a continuous value.

Evaluation involves measuring the quality of a model’s predictions, which is typically done by computing a suitable evaluation metric over a test set D_test of i.i.d. samples, which were not used for training.

However, ERM has its caveats concerning generalization to unseen data, requiring either additional assumptions on the hypothesis class F, which are known as inductive biases, and/or regularization to penalize the complexity of the function class F [445]. In neural networks (discussed in detail in Section 2.1.1), the former is controlled by the architecture of the network, while the latter involves specifying constraints to parameters or adding a regularization term to the loss function.

f̂ = argmin_{f∈F} R̂(f) + λΨ(θ)    (2.4)

 \ No newline at end of file
diff --git a/assets/txts/pg_0046.txt b/assets/txts/pg_0046.txt
new file mode 100644
index 0000000000000000000000000000000000000000..07884809c8a3e46062eae9a6696f8eb9953f830d
--- /dev/null
+++ b/assets/txts/pg_0046.txt
@@ -0,0 +1,47 @@

Equation (2.4) defines regularized empirical risk minimization (RERM), where Ψ(θ) is a regularization term and λ is a hyperparameter that controls the trade-off between the empirical risk (denoted with R̂) and the regularization term.

All these concepts will be revisited in the context of neural networks in Section 2.1.1, where we will also discuss the optimization process of the model parameters θ, how inference differs in the case of probabilistic models to estimate uncertainty (Section 2.2.5), and how regularization affects confidence estimation and calibration (Section 2.2.4).
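To make Equation (2.4) concrete, here is a toy sketch (ours, assuming a binary logistic-regression model; not the thesis code) that minimizes a regularized empirical risk by gradient descent, with Ψ(θ) = ‖θ‖² (weight decay) and trade-off λ:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def rerm_fit(X, y, lam=0.1, lr=0.1, steps=500):
    # Minimize empirical cross-entropy + lam * ||theta||^2 (Equation (2.4)).
    n, d = X.shape
    theta = np.zeros(d)
    for _ in range(steps):
        p = sigmoid(X @ theta)
        # Gradient of the empirical risk plus gradient of the L2 regularizer.
        grad = X.T @ (p - y) / n + 2 * lam * theta
        theta -= lr * grad
    return theta

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X @ np.array([1.5, -2.0, 0.5]) > 0).astype(float)
print("fitted parameters:", np.round(rerm_fit(X, y), 2))

Setting lam=0 recovers plain ERM; larger values trade training fit for smaller, simpler parameters.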
2.1.1 Neural Networks

An artificial neural network (NN) is a mathematical approximation inspired by data processing in the human brain [396]. It can be represented by a network topology of interconnected neurons that are organized in layers that successively refine intermediately learned feature representations of the input [448] that are useful for the task at hand, e.g., classifying an animal by means of its size, shape and fur, or detecting the sentiment of a review by focusing on adjectives.

A basic NN building block is a linear layer, which is a linear function of the input: f(x) = Wx + b, where the bias term b is a constant vector shifting the decision boundary away from the origin, and the weight matrix W holds most parameters that rotate the decision boundary in input space. Activation functions (e.g., tanh, ReLU, sigmoid, softmax, GELU) are used to introduce non-linearity in the model, which is required for learning complex functions.

The first deep learning (DL) network (stacking multiple linear layers) dates back to 1965 [191], yet the term ‘Deep Learning’ was coined in 1986 [398]. The first successful DL application was a demonstration of digit recognition in 1998 [244], followed by DL for CV [90, 223] and NLP [76]. The recent success of DL is attributed to the availability of large datasets, the increase in computational power, the development of new algorithms and architectures, and the commercial interest of large companies.

Consider a conventional DL architecture as a composition of parameterized functions. Each consists of a configuration of layers (e.g., convolution, pooling, activation function, normalization, embeddings) determining the type of input transformation (e.g., convolutional, recurrent, attention) with (trainable) parameters linear/non-linear w.r.t. the input x. Given the type of input, e.g., language, which is naturally discrete-sequential, or vision, which presents a ready continuous-spatial signal, different DL architectures have been established, which will be discussed in Section 2.1.3.

 \ No newline at end of file
diff --git a/assets/txts/pg_0047.txt b/assets/txts/pg_0047.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95df88d3ea4797f9b22ba3aafdfa7f60033ca999
--- /dev/null
+++ b/assets/txts/pg_0047.txt
@@ -0,0 +1,53 @@

A K-class classification function with an l-layer NN with d-dimensional input x ∈ R^d is shorthand f_θ : R^d → R^K, with θ = {θ_j}_{j=1}^{l} assumed to be optimized, either partially or fully, using backpropagation and a loss function. More specifically, it presents a non-convex optimization problem, concerning multiple feasible regions with multiple locally optimal points within each. With maximum-likelihood estimation, the goal is to find the optimal parameters or weights that minimize the loss function, effectively interpolating the training data. This process involves traversing the high-dimensional loss landscape. Upon convergence of model training, the optimized parameters form a solution in the weight-space, representing a unique mode (specific function f_θ̂). However, when regularization techniques such as weight decay, dropout, or early stopping are applied, the objective shifts towards maximum-a-posteriori (MAP), to take into account the prior probability of the parameters. The difference in parameter estimation forms the basis for several uncertainty estimation methods, covered in Section 2.2.5.

A prediction is a translation of a model’s output to which a standard decision rule is applied, e.g., to obtain the top-1/k prediction (Equation (2.5)), or decode structured output according to a function maximizing total likelihood with optionally additional diversity criteria.

ŷ = argmax f_θ̂(x)    (2.5)

Considering standard NNs, the last layer outputs a vector of real-valued logits z ∈ R^K, which in turn are normalized to a probability distribution over K classes using a sigmoid or softmax function (Table 2.1).

σ(z) = 1 / (1 + exp(−z))        softmax(z)_k = exp(z_k) / Σ_{j=1}^{K} exp(z_j)

Table 2.1. Sigmoid and softmax activation functions for binary and multi-class classification, respectively.
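As a small illustrative sketch (ours), the activations of Table 2.1 and the decision rule of Equation (2.5) can be combined as follows; subtracting the maximum logit before exponentiation is a standard numerical-stability trick that leaves the softmax unchanged:

import numpy as np

def softmax(z):
    # Stable softmax: shifting by the max logit avoids overflow.
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def predict_top1(logits):
    # Equation (2.5): the standard arg-max decision rule.
    return np.argmax(softmax(logits), axis=-1)

logits = np.array([[2.0, 0.5, -1.0],    # sharp distribution, class 0
                   [0.1, 0.0, 0.2]])    # nearly uniform distribution
print(softmax(logits).round(3))
print(predict_top1(logits))             # -> [0 2]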
2.1.2 Probabilistic Evaluation

The majority of our works involves supervised learning with NNs, formulated generically as a probabilistic predictor in Definition 1.

 \ No newline at end of file
diff --git a/assets/txts/pg_0048.txt b/assets/txts/pg_0048.txt
new file mode 100644
index 0000000000000000000000000000000000000000..45036e3b02241e313a1b6480f5de53b3d5cf0636
--- /dev/null
+++ b/assets/txts/pg_0048.txt
@@ -0,0 +1,45 @@

Definition 1 (Probabilistic Predictor). A probabilistic predictor is a function f : X → Δ_Y that outputs a conditional probability distribution P(y′|x) over outputs y′ ∈ Y for an i.i.d. drawn sample (x, y).

Definition 2 (Probability Simplex). Let Δ_Y := {v ∈ R_{≥0}^{|Y|} : ‖v‖₁ = 1} be a probability simplex of size |Y| − 1 as a geometric representation of a probability space, where each vertex represents a mutually exclusive label and each point has an associated probability vector v [368].

Figure 2.1 illustrates a multi-class classifier, where Y = [K] for K = 3 classes.

Figure 2.1. Scatter plot of a ternary problem (K = 3, N = 100) in the probability simplex space. Example of an overconfident misprediction (above is a Shiba Inu dog) and a correct sharp prediction (clear image of a Beagle).

In practice, loss functions are proper scoring rules [330], S : Δ_Y × Y → R, that measure the quality of a probabilistic prediction P(ŷ|x) given the true label y. The cross-entropy (CE) loss is a popular loss function for classification, while the mean-squared error (MSE) loss is used for regression. In Section 2.2, we will discuss the evaluation of probabilistic predictors in more detail, including the calibration of confidence estimates and the detection of out-of-distribution samples.
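For illustration (our own sketch; the formal definitions appear later as Equations (2.10) and (2.11) in Section 2.2.3), both scoring rules can be computed directly on predicted distributions, together with a check that each prediction indeed lies on the probability simplex of Definition 2:

import numpy as np

def on_simplex(p, tol=1e-9):
    # Definition 2: non-negative entries that sum to one.
    return bool(np.all(p >= -tol) and abs(p.sum() - 1.0) < tol)

def nll(probs, y_true):
    # Cross-entropy / negative log-likelihood: penalizes only the
    # probability assigned to the true class.
    return -np.log(probs[np.arange(len(y_true)), y_true]).mean()

def brier(probs, y_true, num_classes):
    # Mean squared distance between the predicted distribution and the
    # one-hot encoding of the true label.
    onehot = np.eye(num_classes)[y_true]
    return ((probs - onehot) ** 2).sum(axis=1).mean()

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])
y = np.array([0, 2])                     # the second prediction is wrong
assert all(on_simplex(p) for p in probs)
print(f"NLL: {nll(probs, y):.3f}, Brier: {brier(probs, y, 3):.3f}")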
2.1.3 Architectures

Throughout the chapters of the thesis, we have primarily used the following NN architectures: Convolutional Neural Networks (CNNs) and Transformer networks. We will briefly introduce the building blocks of these architectures, with a focus on how they are used in the context of document understanding.

 \ No newline at end of file
diff --git a/assets/txts/pg_0049.txt b/assets/txts/pg_0049.txt
new file mode 100644
index 0000000000000000000000000000000000000000..21180eaa1fcd8ce1b0a1484273e9eaf9ee6848b5
--- /dev/null
+++ b/assets/txts/pg_0049.txt
@@ -0,0 +1,41 @@

2.1.3.1 Convolutional Neural Networks

Convolutional Neural Networks (CNNs) [244] are a class of DNNs designed primarily for visual and grid-spatial data such as images. They are inspired by the visual cortex of animals, which contains neurons that are sensitive to small subregions of the visual field, called a receptive field. The receptive fields of different neurons partially overlap such that they cover the entire visual field, growing larger in deeper layers of the visual cortex.

Figure 2.2. Sketch of a CNN architecture. The input is a 2D image, which is iteratively convolved with a set of learned filters detecting specific input features, e.g., edges, corners, blobs, to produce feature maps. Feature maps are then downsampled using a pooling operation.

As illustrated in Figure 2.2, CNNs are composed of multiple convolutional layers, which hierarchically extract features from the input, followed by pooling and fully-connected layers to classify the input based on the downsampled features. A filter K ∈ R^{d×d} is a rectangular matrix of trainable weights with width and height d typically smaller than the input x. A convolutional layer applies filters sliding over the input, with each filter producing a feature map:

F = K ∗ x,    (2.6)

where the convolution operation ∗ computes a dot product between filter entries and the covered portions of the input.

Thanks to the weight-sharing property of the convolution operation, CNNs are able to learn translation invariance, i.e., the ability to recognize an object regardless of its position in the image. This is particularly useful for object detection, where the position of the object in the image is unknown.

This architecture was used for document image classification and document layout analysis (Section 6.3.2). A special version is 1-D CNNs, which we applied to one-hot encoded text data in text classification benchmarking (Section 3.4.3).
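A naive, direct implementation of Equation (2.6) follows (our sketch; production CNN layers add padding, strides and many parallel filters, and, as in most DL frameworks, compute cross-correlation, i.e., the filter is not flipped):

import numpy as np

def conv2d(x, k):
    # F = K * x: each output entry is the dot product between the filter
    # and the input patch it currently covers (valid padding, stride 1).
    h, w = x.shape
    d = k.shape[0]
    out = np.zeros((h - d + 1, w - d + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(k * x[i:i + d, j:j + d])
    return out

x = np.arange(25, dtype=float).reshape(5, 5)  # toy 5x5 "image"
k = np.array([[1.0, 0.0], [0.0, -1.0]])       # tiny 2x2 edge-like filter
print(conv2d(x, k))                           # 4x4 feature map

Because the same filter weights are reused at every spatial position, a feature is detected wherever it appears, which is the weight-sharing behind translation invariance.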
Recurrent Neural Networks (RNNs), and recurrent architectures extended to model long-range dependencies such as Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU) networks, were the dominant architectures for sequence modeling in NLP, yet they have been superseded by Transformers in recent years.

 \ No newline at end of file
diff --git a/assets/txts/pg_0051.txt b/assets/txts/pg_0051.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3de9aa17deb3cf903ad6925ca5c68ef5ef850e5e
--- /dev/null
+++ b/assets/txts/pg_0051.txt
@@ -0,0 +1,58 @@

2.1.3.3 Transformer Network

A Transformer [473] is a sequence-to-sequence model that uses an attention mechanism to capture long-range dependencies in the input sequence, benefiting from increased parallelization. Traditionally, it consists of an encoder and a decoder, each composed of multiple layers of self-attention and feed-forward layers.

Attention is a mechanism that allows for soft selection of relevant information from a set of candidates, e.g., tokens in a document, based on a query, e.g., a token in the document. The scaled dot-product attention is defined for a sequence of length n as follows:

Att(Q, K, V) = Σ_{i=1}^{n} α_i V_i

It utilizes three learnable weight matrices, each multiplied with all token embeddings in a sequence to build queries Q ∈ R^{n×d_q}, keys K ∈ R^{n×d_q}, and values V ∈ R^{n×d_v}. The output of the attention mechanism is a weighted sum of the unnormalized values, where each attention weight of the i-th key is computed by normalizing the dot product between the query and key vectors:

α_i = exp(Q_i^T K_i) / Σ_{j=1}^{n} exp(Q_j^T K_j)

For training stability, the dot product is typically scaled by the square root of the dimensionality of the query and key vectors. This is followed by a feed-forward layer to capture non-linear relationships between the tokens in the sequence.

There exist different forms of attention, depending on the type of relationship that is captured. Self-attention computes the attention of each token w.r.t. all other tokens in the sequence, which changes the representation of each token based on the other tokens in the sequence. Multi-head attention is a set of h attention layers, which every Transformer uses to concurrently capture different types of relationships, concatenated together after the parallelized processing. Cross-attention computes the attention of each token in one sequence w.r.t. all tokens in another sequence, which is used in encoder-decoder Transformer architectures for, e.g., summarization and machine translation. Specific to decoder layers, masked attention is used to prevent the decoder from attending to future tokens in the sequence by masking the upper triangle of the attention matrix calculation.

A major downside to Transformers is the quadratic complexity of the attention mechanism (Figure 2.3), which makes them computationally inefficient for long sequences. This has been addressed by a wealth of techniques [120], such as sparsifying attention, targeting recurrence, downsampling, and random or low-rank approximations.

Figure 2.3. Illustration of the main attention mechanisms in a Transformer.

 \ No newline at end of file
diff --git a/assets/txts/pg_0052.txt b/assets/txts/pg_0052.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95fde3416660d5dc9eb5031341f239b682ead4d3
--- /dev/null
+++ b/assets/txts/pg_0052.txt
@@ -0,0 +1,38 @@
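The attention computation described above can be sketched in a few lines (our illustration: a single head, no masking, and randomly initialized weight matrices for the example; the softmax normalizes each query's weights over all keys):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def attention(Q, K, V):
    # Scaled dot-product attention: attention weights are a softmax over
    # the query-key dot products, scaled by sqrt(d_q) for stability.
    d_q = Q.shape[-1]
    alpha = softmax(Q @ K.T / np.sqrt(d_q))   # (n, n) attention weights
    return alpha @ V                          # weighted sum of values

rng = np.random.default_rng(0)
n, d = 4, 8                                   # sequence length, head dim
X = rng.normal(size=(n, d))                   # token embeddings
Wq, Wk, Wv = (rng.normal(size=(d, d)) for _ in range(3))
out = attention(X @ Wq, X @ Wk, X @ Wv)
print(out.shape)                              # (4, 8)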
The most common type of position embedding is a sinusoidal + + \ No newline at end of file diff --git a/assets/txts/pg_0052.txt b/assets/txts/pg_0052.txt new file mode 100644 index 0000000000000000000000000000000000000000..95fde3416660d5dc9eb5031341f239b682ead4d3 --- /dev/null +++ b/assets/txts/pg_0052.txt @@ -0,0 +1,38 @@ +20 + +FUNDAMENTALS + +Quadratic complexity + +Figure 2.3. Illustration of the main attention mechanisms in a Transformer. + +embedding with a fixed frequency and phase, f (x) = sin(ωx + φ), where ω is the +frequency and φ is the phase which are learned as part of the training process, +and they are typically shared across all tokens in the sequence. Integrating +position information into Transformers can be achieved in different ways, which +[105, Table 1] gives an overview for. +Transformers have gradually taken over as an end-to-end architecture for both +NLP and CV tasks, albeit adoption in CV has been slower, due to the lack +of spatial invariance in the original Transformer architecture. This has been +addressed by recent works, such as Vision Transformer (ViT) [101], which uses +a patch-based input representation with position embeddings. +A large language model (LLM) consists of a stack of Transformers that is +pretrained on a large corpus of text, typically using a self-supervised learning +objective, such as predicting the next token in a sequence. The goal of LLMs +is to learn a general-purpose language representation that can be fine-tuned +to perform well on a wide range of downstream tasks. LLMs have disrupted +NLP in recent years, as they have achieved SOTA performance on a wide +range of tasks thanks to pretraining on large amounts of data. The most +popular LLMs are BERT [95], RoBERTa [287], ELECTRA [73], T5 [383], +GPT-3 [52], Llama-2 [452], and Mistral [199]. Next to challenges specific to +modeling document inputs, explained in Section 2.3.4, open challenges for +LLMs include: (i) structured output generation, (ii) domain-specific knowledge +injection (e.g., does retrieval-augmented generation (RAG) suffice? [253, 347]), +(iii) multimodality. +Vision-language models (VLM) are a recent development in multimodal +learning, which combine the power of LLMs with vision encoders to perform +tasks that require understanding both visual and textual information. The most +popular VLMs are CLIP [381], UNITER [70], FLAVA [423] and GPT-4 [344]. +In every chapter of this dissertation we have used Transformers, either as part + + \ No newline at end of file diff --git a/assets/txts/pg_0053.txt b/assets/txts/pg_0053.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a9a308e37ea6c8bae8d6d4688fd3f439f56bc99 --- /dev/null +++ b/assets/txts/pg_0053.txt @@ -0,0 +1,46 @@ +RELIABILITY AND ROBUSTNESS + +21 + +of a foundation model for DU tasks (Chapters 4 to 6) or to contrast with 1-D +CNNs in text classification (Chapter 3). Note that [265] share our concerns that +NLP needs a new ‘playground’ with more realistic tasks and benchmarks, which +extend beyond sentence-level contexts to more complex document-level tasks. +Alternative sub-quadratic architectures have started addressing Transformer’s +computational inefficiency on long sequences, e.g., Mamba [152] and Longnet +[99]. Time will tell if these will be able to compete with the Transformer’s +dominance in foundation models. 
+ +2.2 + +Reliability and Robustness + +Chapter 3 contains a lot of relevant content on the basic relation between +uncertainty quantification, calibration, and distributional generalization or +detection tasks. Here, we will focus on the more general concepts of reliability +and robustness, and how they relate to concepts used throughout the rest of +the thesis. Next, we discuss the need for confidence estimation and appropriate +evaluation metrics, followed by short summaries of the main research trends in +calibration and uncertainty quantification. +Emerging guidance and regulations [2, 3, 475] place increasing importance on +the reliability and robustness of ML systems, particularly once they are used +in the public sphere or in safety-critical applications. In ML, reliability and +robustness are often used interchangeably [78, 420, 455], yet they are distinct +concepts, and it is important to understand the difference between them. This +thesis uses the following definitions of reliability and robustness, adapted from +systems engineering literature [395]: +Definition 3 [Reliability]. Reliability is the ability of a system to consistently +perform its intended function in a specific, known environment for a specific +period of time, with a specific level of expected accuracy [395]. Closer to the ML +context, this entails all evaluation under the i.i.d. assumption, allowing for some +benign shifts of the distribution, including predictive performance evaluation +with task-dependent metrics (accuracy, F1, perplexity, etc.), calibration, selective +prediction, uncertainty estimation, etc. +Reliability requires to clearly specify the role an ML component plays in a +larger system, and to define the expected behavior of the system as a function +of alignment with the training data distribution. This is particularly important +in the context of black-box models, where the inner workings of the model are +not transparent to the user. In this case, the user needs to be aware of the +model’s limitations, e.g., model misspecification, lack of training data, and the + + \ No newline at end of file diff --git a/assets/txts/pg_0054.txt b/assets/txts/pg_0054.txt new file mode 100644 index 0000000000000000000000000000000000000000..8311870e0107a75e565ce0ffd60775b808ca2926 --- /dev/null +++ b/assets/txts/pg_0054.txt @@ -0,0 +1,45 @@ +22 + +FUNDAMENTALS + +model needs to be able to communicate its own uncertainty to the user. This is +the focus of Chapter 3. +Definition 4 [Robustness]. Robustness is the ability of a system to maintain +its intended function despite a wide range of disturbances, with a minimal +degradation of performance [395]. Such disturbances can take the form of +adversarial attacks, distributional shifts, or other types of noise. In the ML +context, this entails all evaluation violating the i.i.d. assumption, including +adversarial and label noise robustness, out-of-distribution detection, domain +generalization, extrapolation, etc. +Robustness is more involved with the application scope in which a model can +perform well, assuming that the model can maintain some degree of its prediction +capacity on non-i.i.d. data which might be unknown at training time. Detecting +when the model is operating outside of its intended scope is an important part +of robustness to prevent failure propagation to downstream systems. 
Resilience is another component of the R3 (reliability, robustness, resilience) concept in systems engineering, yet it is not a focus of this thesis, nor is it a relevant qualifier of the ML model in isolation, as it is more related to the system as a whole. Resilient systems are able to recover from disturbances, even those caused by model misspecification, e.g., by adapting to new environments and unexpected inputs from unknown distributions or by self-healing.

2.2.1 Generalization and Adaptation

To complete the R3 picture, we cannot overlook the generalization-adaptation spectrum, which has been less explored in our works, yet it is an important part of current practices in ML.

Definition 5 [Generalization-adaptation]. Generalization is the ability of a system to perform its intended function in a wide range of environments, including those not known at design time [395]. Each environment is defined by a data distribution over a domain and a task, and generalization is the ability of a model to perform well on new data drawn from the same distribution. Adaptation is the ability of a system to perform its intended function in a specific, known environment, despite changes in the system itself or its environment [395]. This entails the ability of a model to perform well on new data drawn from a different distribution, which is known at design time.

Different settings of generalization-adaptation are: in-distribution (same domain and task), domain generalization (same task, different domain), task generalization (same domain, different task), and out-of-distribution (different domain or task).

 \ No newline at end of file
diff --git a/assets/txts/pg_0055.txt b/assets/txts/pg_0055.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1abe0408da2c51ef0a6e8497de5a0ab28cd8a8ac
--- /dev/null
+++ b/assets/txts/pg_0055.txt
@@ -0,0 +1,45 @@

If the model has access to limited samples for training on the new distribution, it is referred to as few-shot learning (or, with no samples at all, zero-shot learning); if it is able to adapt to new distributions over time, or accumulate knowledge over different tasks without retraining from scratch [87], it is referred to as continual learning or incremental learning.

Many of these settings are referred to in business as out-of-the-box or self-learning, yet without any formal definitions given. Domain and task generalization are major selling points of pretrained LLMs, which are able to perform well on a wide range of tasks and domains. In the case of very different distributions, e.g., a different task/expected output or an additional domain/input modality, it is often necessary to fine-tune the model on a small amount of data from the new distribution, which is known as transfer learning. Specific to LLMs, instruction tuning is a form of transfer learning, where samples from a new distribution are appended with natural language instructions [69, 532]. This approach has been used in Chapter 5 to adapt pretrained LLMs to the task of DocVQA, in an effort to reduce the amount of annotated data required to generalize to unseen domains and questions.

2.2.2 Confidence Estimation

A quintessential component of reliability and robustness requires a model to estimate its own uncertainty, or inversely to translate model outputs into probabilities or ‘confidence’ (Definition 6).

Definition 6 [Confidence Scoring Function].
Definition 6 [Confidence Scoring Function]. Any function g : X → R whose continuous output aims to separate a model's failures from correct predictions can be interpreted as a confidence scoring function (CSF) [193]. Note that while it is preferable to have the output domain of g in [0, 1] for easier thresholding, this is not a strict requirement.

Circling back to the question of why one needs a CSF, there are multiple reasons: i) ML models are continually improving, yet zero test error is an illusion; even a toy dataset (MNIST) is not perfectly separable; ii) once a model is deployed, performance deterioration is expected as i.i.d. assumptions break; iii) generative models are prone to hallucinations [198], requiring control mechanisms and guardrails to guide them.

Below, we present some common CSFs used in practice [114, 172, 194, 539], where for convenience the subscript is reused to denote the k-th element of the output vector, g(x) = g_k(x).

I. Maximum softmax probability (MSP): g(x) = max_{y'∈Y} f_{y'}(x)
II. Maximum logit: g(x) = max_{y'∈Y} z_{y'}(x), with logits z ∈ R^K
III. Negative entropy: g(x) = Σ_{y'∈Y} f_{y'}(x) log f_{y'}(x)
IV. Margin: g(x) = max_{y'∈Y} f_{y'}(x) − max_{y''∈Y∖y'} f_{y''}(x)
V. Distance-based measures
• kNN distance: a 1D outlier score derived from the average distance of the feature representation of x to its k nearest neighbors in the training distribution
• Mahalanobis distance [390]: the minimum distance of the feature map (e.g., penultimate layer activations) of a test input to class-conditional Gaussian distributions of the training data
VI. Bayesian uncertainty estimation

Chapter 3 used MSP and negative entropy as CSFs, next to various PUQ methods for Bayesian uncertainty estimation. Other chapters used MSP, as it is the most common CSF in practice, requiring only logits as input; the first four CSFs are sketched in code below. From the use of CSFs also follows the need to evaluate their statistical quality next to task-specific predictive performance metrics, which is discussed next.
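The following is a minimal sketch of CSFs I–IV operating on raw logits, assuming only NumPy and a standard softmax; function names are ours, chosen for illustration.

    import numpy as np

    def softmax(z, axis=-1):
        z = z - z.max(axis=axis, keepdims=True)  # numerical stabilization
        e = np.exp(z)
        return e / e.sum(axis=axis, keepdims=True)

    def msp(logits):
        """I. Maximum softmax probability."""
        return softmax(logits).max(axis=-1)

    def max_logit(logits):
        """II. Maximum logit."""
        return logits.max(axis=-1)

    def negative_entropy(logits, eps=1e-12):
        """III. Negative entropy of the softmax distribution (higher = more confident)."""
        p = softmax(logits)
        return (p * np.log(p + eps)).sum(axis=-1)

    def margin(logits):
        """IV. Gap between the two largest softmax probabilities."""
        p = np.sort(softmax(logits), axis=-1)
        return p[..., -1] - p[..., -2]

Only MSP and margin are bounded in [0, 1] (resp. [-1, 1]); the other two illustrate that a CSF merely needs to rank failures below successes.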
2.2.3 Evaluation Metrics

In an ideal world, the evaluation metric of interest would be the same as the loss function used for training, yet this is rarely the case in practice: gradient-based optimization requires a continuously differentiable function, while the metric of interest is often non-differentiable, e.g., accuracy vs. cross-entropy in classification.

Throughout our works, we have used (or extended) multiple predictive performance, calibration, and robustness metrics, of which the most interesting are outlined below.

Average Normalized Levenshtein Similarity (ANLS) is a metric introduced in [39] for the evaluation of VQA, which was later extended [449] to support lists and be invariant to the order of provided answers. We adapted the underlying Levenshtein Distance (LD) metric [251] to support not-answerable questions, NA(G) = I[type(G) = not-answerable] (see Equation (2.7)).

Consider for simplicity the evaluation of a single non-list ground truth answer G and prediction P̂, with string lengths |G| and |P̂|, respectively.

    LD(G, P̂) =
      1                                 if NA(G) ∧ |P̂| > 0
      0                                 if NA(G) ∧ |P̂| = 0
      |G|                               if |P̂| = 0
      |P̂|                               if |G| = 0
      LD(tail(G), tail(P̂))              if G[0] = P̂[0]
      1 + min{ LD(tail(G), P̂)           (deletion)
               LD(G, tail(P̂))           (insertion)
               LD(tail(G), tail(P̂)) }   (substitution)   if G[0] ≠ P̂[0]
    (2.7)

Each of the conditions is tested in turn, and the first one that is true is executed. The normalized similarity metric is then defined as

    NLS(G, P̂) = 1 − LD(G, P̂) / max(1, |G|, |P̂|).

Given multiple ground-truth answer variants G_i = {a_1, a_2, ...} and a predicted answer P̂_{Q_i} for each question Q_i in the test set of size N, we define the complete metric as follows:

    ANLS = (1/N) Σ_{i=1}^{N} max_{a∈G_i} s(a, P̂_{Q_i})    (2.8)

    s(a, P̂_{Q_i}) = NLS(a, P̂_{Q_i})   if NLS(a, P̂_{Q_i}) ≥ τ
                    0                  if NLS(a, P̂_{Q_i}) < τ    (2.9)

where we follow prior literature [39, 449] in setting the threshold τ = 0.5. In the case of a list-type question, Hungarian matching is performed following [449] according to the NLS between each ground-truth answer part and each predicted answer part.
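To make Equations (2.7)–(2.9) concrete, a compact sketch for single-answer (non-list) questions follows; the Hungarian matching used for list questions is omitted, and all function names are ours.

    import numpy as np

    def levenshtein(g: str, p: str, not_answerable: bool = False) -> int:
        """Edit distance with the not-answerable convention of Eq. (2.7):
        any non-empty prediction for a not-answerable question costs 1."""
        if not_answerable:
            return 1 if len(p) > 0 else 0
        # dynamic-programming formulation of the recursion in Eq. (2.7)
        dp = np.arange(len(p) + 1)
        for i in range(1, len(g) + 1):
            prev, dp[0] = dp[0], i
            for j in range(1, len(p) + 1):
                cur = dp[j]
                dp[j] = prev if g[i-1] == p[j-1] else 1 + min(prev, dp[j], dp[j-1])
                prev = cur
        return int(dp[-1])

    def nls(g: str, p: str, not_answerable: bool = False) -> float:
        return 1.0 - levenshtein(g, p, not_answerable) / max(1, len(g), len(p))

    def anls(gold: list[list[str]], preds: list[str], tau: float = 0.5) -> float:
        """Eq. (2.8): average over questions of the best thresholded NLS."""
        scores = []
        for variants, pred in zip(gold, preds):
            best = max(nls(a, pred) for a in variants)
            scores.append(best if best >= tau else 0.0)  # Eq. (2.9)
        return float(np.mean(scores))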
Proper scoring rules [330] are used for generic evaluation of predictive performance; they score at the instance level while measuring both the quality of the predictive function and the predicted probability distribution (as such, they are not compatible with an arbitrary CSF):

• Negative Log-Likelihood (NLL) [378] is both a popular loss function (cross-entropy) and a scoring rule which only penalizes the (wrong) log probability given to the true class, with I an indicator function selecting the true class. This measure more heavily penalizes sharp probabilities that place mass near the wrong class through over- or under-confidence.

    ℓ_NLL(f) = −(1/N) Σ_{i=1}^{N} Σ_{k=1}^{K} I[y_i = k] · log f_k(x_i)    (2.10)

• Brier Score [50] is a scoring rule that measures the accuracy of a probabilistic classifier and is related to the mean-squared error (MSE) loss function. The Brier score is more commonly used in industrial practice since it is an ℓ2 metric (score between 0 and 1), yet it penalizes tail probabilities less severely than NLL.

    ℓ_BS(f) = (1/N) Σ_{i=1}^{N} Σ_{k=1}^{K} (I[y_i = k] − f_k(x_i))²    (2.11)

All of the following metrics require a CSF g(x) to be defined, and can pertain to specific evaluation settings [389] tested in Section 3.4.5.

Expected Calibration Error (ECE) [156, 332] is a default metric to evaluate top-1 prediction miscalibration. A calibration estimator (Definition 7) measures the Lp-norm difference between a model's posterior and the true likelihood of being correct.

Definition 7 (Lp Calibration Error). [231, 463] The Lp calibration error of f : X → ∆^Y over the joint distribution (X × Y), with the Lp norm p ∈ [1, ∞), is given by:

    CE_p(f)^p = E_{(X,Y)} [ ‖ E[Y | f(X)] − f(X) ‖_p^p ]    (2.12)

The popular ECE metric [332] with condition I[Y = ŷ] is a special case of the above with p = 1, where the expectation is approximated using a histogram. MaxCE defines the worst-case risk version with p = ∞, effectively reporting on the bin with the highest error. As part of Chapter 5, we contributed a novel empirical estimator of top-1 calibration for the task of VQA, where the exact accuracy condition I[Y = ŷ] in ECE is replaced by I[ANLS(y, ŷ) > τ]. Prior work [329] used a similar strategy of thresholding continuous quality scores to be able to estimate ECE.

In practice, ECE is implemented as a histogram binning estimator that discretizes predicted probabilities into ranges of possible values for which the conditional expectation can be estimated. Concretely, the probability space is partitioned into B bins b_i with i ∈ {1, ..., B}, where for each bin b_i the gap between observed accuracy acc(b_i) and average bin confidence P̄(b_i) is measured, with the final score a weighted average by the number of samples per bin |b_i|:

    ECE = Σ_{i=1}^{B} (|b_i| / N) · | acc(b_i) − P̄(b_i) |    (2.13)

To minimize the drawbacks inherited from histogram binning, as suggested by the literature [231, 342, 393, 463], we have applied an equal-mass binning scheme with 100 bins (close to √N). While plenty of histogram-based ECE estimator implementations exist, many design hyperparameters are not reported or exposed:

I. the ℓp norm;
II. the number of bins (beyond the unfounded default of |B| = 15);
III. the binning scheme (equal-range, equal-mass);
IV. the binning range defining the operating zone;
V. the proxy used for a bin's confidence (lower-edge, center, upper-edge).

We upstreamed a generic implementation of binning-based ECE (https://huggingface.co/spaces/jordyvl/ece) as part of the ICDAR 2023 DUDE competition (Chapter 5).
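A minimal estimator along the lines of Equation (2.13) is sketched below, exposing the binning-scheme choice from the list above; this is an illustrative simplification, not the exact upstreamed implementation.

    import numpy as np

    def ece(confidences, correct, n_bins=100, equal_mass=True, p=1):
        """Binned L_p calibration error; p=1 recovers ECE as in Eq. (2.13).
        confidences: CSF outputs in [0, 1]; correct: per-sample 0/1 correctness."""
        confidences = np.asarray(confidences, float)
        correct = np.asarray(correct, float)
        if equal_mass:  # equal-mass: bin edges at confidence quantiles
            edges = np.quantile(confidences, np.linspace(0, 1, n_bins + 1))
        else:           # equal-range: uniform bins over [0, 1]
            edges = np.linspace(0, 1, n_bins + 1)
        idx = np.clip(np.searchsorted(edges, confidences, side="right") - 1,
                      0, n_bins - 1)
        total, err = len(confidences), 0.0
        for b in range(n_bins):
            mask = idx == b
            if not mask.any():
                continue  # empty bins contribute nothing
            gap = abs(correct[mask].mean() - confidences[mask].mean())
            err += (mask.sum() / total) * gap ** p
        return err ** (1 / p)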
Alternative formulations have been developed for multi-class [342, 370, 492] and multi-label calibration [493, 520]. Measurements of "strong" calibration, over the full predicted vector instead of only the winning class, are reported less often in practice. Possible reasons are that they render class-wise scorings, either based on adaptive thresholds, or require estimation of a kernel-based calibration error to derive hypothesis tests. While we are mindful of alternatives (revisited in Section 2.2.4), we have found that the simpler "weak" calibration measured by ECE meets the practical requirements for most of our benchmarking.

Area-Under-Risk-Coverage-Curve (AURC) [138, 193] measures the possible trade-offs between coverage (the proportion of the test set accepted) and risk (the error rate under a given coverage). The metric explicitly assesses i.i.d. failure detection performance, as desired for safe deployment. It has advantages as a primary evaluation metric given that it is effective both when underlying prediction models are the same and when they are different (as opposed to AUROC or AUPR). Its most general form (without any curve approximation), with a task-specific evaluation metric ℓ and CSF g, is defined as:

    AURC(f, g) = E_{x∼P_X} [ E_{(x̃,ỹ)∼P_{XY}}[ ℓ(f(x̃), ỹ) · I[g(x̃) > g(x)] ] / E_{x̃∼P_X}[ I[g(x̃) > g(x)] ] ]    (2.14)

This captures the intuition that the CSF g should be able to rank instances by their risk, and that the risk should be low for instances with high confidence.

The standard curve metric can be obtained by sorting all CSF estimates from high to low and evaluating, for each threshold t, the risk FP/(TP+FP) and the coverage (TP+FP)/(TP+FP+FN+TN), where an instance counts as positive (P) if its score is above the threshold, together with its respective correctness (T if correct); a sketch follows below. Correctness is normally based on exact match, yet for generative evaluation in Section 5.3.5 we have applied ANLS thresholding instead. Formulated this way, the best possible AURC is constrained by the model's test error (1-ANLS) and the number of test instances. AURC might be more sensible for evaluating in a high-accuracy regime (e.g., 95% accuracy), where risk can be better controlled and error tolerance is an a priori system-level decision [115]. This metric was used in every chapter of Part II.

For the evaluation under distribution shift in Chapter 3, we have used binary classification metrics following [172]: Area Under the Receiver Operating Characteristic Curve (AUROC) and Area Under the Precision-Recall Curve (AUPR), which are threshold-independent measures that summarize detection statistics of positive (out-of-distribution) versus negative (in-distribution) instances. In this setting, AUROC corresponds to the probability that a randomly chosen out-of-distribution sample is assigned a higher confidence score than a randomly chosen in-distribution sample. AUPR is more informative under class imbalance.
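The following sketch computes the empirical risk-coverage curve and its area from per-instance CSF scores and correctness (exact match or thresholded ANLS); names are illustrative.

    import numpy as np

    def risk_coverage_curve(confidence, correct):
        """Sort by decreasing CSF score and sweep the acceptance threshold,
        tracking coverage (fraction accepted) and selective risk
        (error rate on the accepted subset)."""
        order = np.argsort(-np.asarray(confidence))
        errors = 1.0 - np.asarray(correct, float)[order]
        n = len(errors)
        coverage = np.arange(1, n + 1) / n
        risk = np.cumsum(errors) / np.arange(1, n + 1)
        return coverage, risk

    def aurc(confidence, correct):
        """Area under the empirical risk-coverage curve (lower is better)."""
        coverage, risk = risk_coverage_curve(confidence, correct)
        # coverage is a uniform grid, so the mean risk equals the area
        return float(np.mean(risk))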
2.2.4 Calibration

The study of calibration originated in the meteorology and statistics literature, primarily in the context of proper loss functions [330] for evaluating probabilistic forecasts. Calibration promises i) interpretability, ii) system integration, iii) active learning, and iv) improved accuracy. A calibrated model, as defined in Definition 8, can be interpreted as a probabilistic model, can be integrated into a larger system, and can guide active learning with potentially fewer samples. Research into calibration regained popularity after repeated empirical observations of overconfidence in DNNs [156, 339].

Definition 8 (Perfect calibration). [86, 88, 520] Calibration is a property of an empirical predictor f, which states that on finite-sample data it converges to a solution where the confidence scoring function reflects the probability ρ of being correct. Perfect calibration, CE(f) = 0, is satisfied iff:

    P(Y = Ŷ | f(X) = ρ) = ρ,  ∀ρ ∈ [0, 1]    (2.15)

Below, we characterize calibration research in two directions:

(A) CSF evaluation with both theoretical guarantees and practical estimation methodologies:
• estimators for calibration notions beyond top-1 [229, 231, 342, 463];
• theoretical frameworks to generalize over existing metrics and design novel metrics [43, 231, 492, 493];
• specialization towards a task such as multi-class classification [463], regression [228, 428], or structured prediction [227];
• alternative error estimation procedures, based on histogram regression [156, 331, 332, 340, 343], kernels [230, 370, 492, 493], or splines [159].

(B) Calibration methods for improving the reliability of a model by adapting the CSF or inducing calibration during training of f:
• learn a post-hoc forecaster F : f(X) → [0, 1] on top of f (overview: [298]);
• modify the training procedure with regularization (overview: [277, 370]).

Due to its importance in practice, we provide more detail on train-time calibration methods. It has been shown for a broad class of loss functions that risk minimization leads to Fisher-consistent, Bayes-optimal classifiers in the asymptotic limit [25, 495]. These can be shown to decompose into a sum of multiple metrics, including both accuracy and calibration error [144, 177]. However, there is no guarantee –neither on finite data nor asymptotically– that classifiers trained with proper loss functions containing an explicit calibration term will eventually be well-calibrated. In practice, being entangled with other optimization terms often leads to sub-optimal calibration. For this reason, recent studies [12, 230, 492] have derived trainable estimators of calibration to have a better handle (γ > 0) on penalizing miscalibration, i.e., by jointly optimizing the risk R(f) = E_{X,Y}[ℓ(Y, f(X))] and a parameterized calibration error (CE) as in Equation (2.16):

    f̂ = argmin_{f∈F} ( R(f) + γ · CE(f) )    (2.16)

Many of these methods implicitly or explicitly maximize the entropy of predictions, or entropy relative to another probability distribution, e.g., Entropy Regularization [361], Label Smoothing (LS) [327], Focal Loss [324], and Margin-based LS [277], next to more direct (differentiable), kernel-based calibration error estimation [211, 230, 370, 492, 493, 526]. We had expected community contributions to the DUDE competition (Chapter 5) to take advantage of this wealth of calibration methods, yet the majority of submissions used uncalibrated models with MSP, suggesting that more education on the importance of calibration in practice is needed.
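As an illustration of Equation (2.16), the sketch below combines cross-entropy risk with a confidence penalty [361] as a simple differentiable stand-in for a trainable calibration-error estimator; the value of γ and the choice of penalty are assumptions made for the example, not a prescription.

    import numpy as np

    def softmax(z):
        z = z - z.max(axis=-1, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=-1, keepdims=True)

    def risk_plus_calibration_loss(logits, labels, gamma=0.1):
        """Joint objective in the spirit of Eq. (2.16): cross-entropy risk R(f)
        plus a gamma-weighted miscalibration surrogate. Here the surrogate is a
        confidence penalty (negative predictive entropy), one of the
        entropy-maximizing regularizers discussed above [361]."""
        p = softmax(logits)
        n = len(labels)
        nll = -np.log(p[np.arange(n), labels] + 1e-12).mean()      # R(f)
        neg_entropy = (p * np.log(p + 1e-12)).sum(axis=-1).mean()  # -H(f(x))
        return nll + gamma * neg_entropy

In a real training loop the same composition would be expressed in an autodiff framework; kernel-based CE estimators [230, 492] slot into the same γ-weighted term.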
For the sake of completeness, there exist different notions of calibration, differing in the subset of predictions considered over ∆Y [463]:

I. top-1 [156]
II. top-r [159]
III. canonical calibration [51]

Formally, a classifier f is said to be canonically calibrated iff

    P(Y = y_k | f(X) = ρ) = ρ_k,  ∀k ∈ [K] ∧ ∀ρ ∈ [0, 1]^K, where K = |Y|.    (2.17)

However, this most strict notion of calibration becomes infeasible to compute once the output space cardinality exceeds a certain size [157]. For discrete target spaces with a large number of classes, there is plenty of interest in knowing that a model is calibrated on less likely predictions as well. Some relaxed notions of calibration have been proposed, which are more feasible to compute and can be used to compare models on a more equal footing. These include: top-label [157], top-r [159], within-top-r [159], and marginal [229, 231, 342, 492].

2.2.5 Predictive Uncertainty Quantification

Bayes' theorem [26] is a fundamental result in probability theory, which provides a principled way to update beliefs about an event given new evidence. Bayesian Deep Learning (BDL) methods build on these solid mathematical foundations and promise reliable predictive uncertainty quantification (PUQ) [124, 136, 140, 238, 301, 325, 326, 464, 466, 496].

The Bayesian approach consists of casting learning and prediction as an inference task about hypotheses (uncertain quantities, with θ representing all BNN parameters: weights w, biases b, and model structure) from training data (measurable quantities, D = {(x_i, y_i)}_{i=1}^N = (X, Y)).

Bayesian Neural Networks (BNN) are in theory able to avoid the pitfalls of stochastic non-convex optimization of non-linear tunable functions with many high-dimensional parameters [300]. More specifically, BNNs can capture the uncertainty in the NN parameters by learning a distribution over them, rather than a single point estimate. This offers advantages in terms of data efficiency, avoiding overfitting thanks to regularization from parameter priors, model complexity control, and robustness to noise due to the probabilistic nature. However, BNNs come with their own challenges, such as the increased computational cost of learning and inference, the difficulty of specifying appropriate weight or function priors, and the need for specialized training algorithms or architectural extensions.

For a fixed model m, the analytically intractable Bayesian posterior distribution of the parameters θ is given by Bayes' rule:

    P(θ | D) = P(D | θ) · P(θ | m) / P(D | m)    (2.18)

where P(D | θ) is the likelihood of θ (in model m), P(θ | m) the prior probability of θ, and P(θ | D) the posterior of θ given data D.

The denominator P(D | m) is intractable, since it requires integrating over all possible parameter values weighted by their probabilities. This is known as the inference problem, which is the main challenge in BDL, as the posterior distribution is required to compute the predictive distribution for any new input (Equation (3.1) further explains this).

In practice, BNNs are often implemented with Variational Inference (VI) methods, which approximate the high-dimensional posterior distribution with a tractable distribution family, such as a Gaussian distribution [46].
Let p(θ | D) be the intractable posterior distribution of parameters θ given observed data D, which will be approximated with a simpler, tractable distribution q(θ|D; φ), parameterized by φ (e.g., mean and variance). The key idea consists of finding the optimal variational parameters φ* that minimize the Kullback–Leibler (KL) divergence between the approximating distribution q(θ|D; φ) and the true posterior p(θ | D) it replaces. This is achieved by maximizing the evidence lower bound (ELBO), given by:

    ELBO(φ) = E_{q(θ|D;φ)}[log p(D|θ)] − KL[q(θ|D;φ) ‖ p(θ)]    (2.19)
            = ∫ q(θ|D;φ) log ( p(D|θ) p(θ) / q(θ|D;φ) ) dθ    (2.20)
            = ∫ q(θ|D;φ) log p(D|θ) dθ − ∫ q(θ|D;φ) log ( q(θ|D;φ) / p(θ) ) dθ,    (2.21)

where the first term in Equation (2.21) represents the expected likelihood of the data given the parameters, and the second term quantifies the dissimilarity between the variational distribution and the prior distribution over the parameters. Maximizing the ELBO over φ is equivalent to minimizing the KL divergence between q(θ|D; φ) and p(θ|D), thereby providing a lower bound on the log marginal likelihood, log p(D) ≥ ELBO(φ), after the parameters θ have been integrated out. By optimizing the variational parameters φ, we simultaneously fit the model to the data well and encourage the approximate posterior to be as close as possible to the true posterior distribution.

Even a non-Bayesian, classic NN can be interpreted in this framework as an approximate, degenerate posterior distribution, i.e., a Dirac delta function centered on the MAP estimate of the parameters, q(θ|D; φ) = δ(θ − θ̂_MAP). More PUQ methods based on different posterior approximations are discussed in detail in Chapter 3, with additional updates on the state-of-the-art.
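A minimal Monte-Carlo sketch of Equation (2.19) for a mean-field Gaussian approximation with the reparameterization trick follows; log_lik is an assumed user-supplied function returning log p(D|θ) for a flat parameter vector θ, and a N(0, prior_sigma²) prior is assumed.

    import numpy as np

    rng = np.random.default_rng(0)

    def elbo_estimate(log_lik, mu, log_sigma, n_samples=8, prior_sigma=1.0):
        """Monte-Carlo ELBO for q(θ; φ) = N(mu, diag(sigma²)), φ = (mu, log_sigma)."""
        sigma = np.exp(log_sigma)
        expected_ll = 0.0
        for _ in range(n_samples):
            eps = rng.standard_normal(mu.shape)
            theta = mu + sigma * eps          # reparameterization trick
            expected_ll += log_lik(theta) / n_samples
        # analytic KL between diagonal Gaussians: KL[q(θ;φ) || N(0, prior_sigma² I)]
        kl = 0.5 * np.sum(
            (sigma**2 + mu**2) / prior_sigma**2
            - 1.0
            - 2.0 * log_sigma
            + 2.0 * np.log(prior_sigma)
        )
        return expected_ll - kl

Gradient-based VI would maximize this quantity with respect to (mu, log_sigma) via automatic differentiation; the sketch only shows how the two ELBO terms are estimated.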
2.2.6 Failure Prediction

Based on the principle of selective prediction [138, 139], failure prediction is the task of predicting whether a model will fail on a given input. In every chapter following Chapter 3, this topic is addressed in the context of the respective task. Since it is an important topic in the context of IA-DU that is generating increasing interest [81, 114, 127, 193, 391], it warrants a brief overview of how it provides a unified perspective. We refer the reader to [171, 536] for a comprehensive survey.

Failure prediction subsumes many related tasks in the sense that it requires a failure source to be defined to form a binary classification task. The failure source can be i.i.d. mispredictions, covariate shifts (e.g., input corruptions, concept drift, domain shift), a new class, domain, modality, task, or concept. The goal of failure prediction is to predict these failures before they occur, allowing for more reliable and robust ML systems.

First, note that calibration does not imply failure prediction, as a model calibrated w.r.t. i.i.d. data can still be overconfident on OOD inputs [549]. Example 2.2.1 sketches the independent requirements of calibration and confidence ranking.

Example 2.2.1. Classifier A scores 90% accuracy on the test set, with a CSF using the entire range [0, 1]. Classifier B scores 92% accuracy on the test set, but the CSF always reports 0.92 for any input. Which classifier is preferred in a real-world setting?
• Classifier B is calibrated (its constant confidence of 0.92 matches its accuracy), but it is not possible to know whether it will fail on a given input.
• Classifier A might be less calibrated, but its CSF provides the separability needed to predict failure on a given input.

Specific to OOD failure prediction, [527] provides a comprehensive categorization of failure tasks and methods.
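Since failure prediction forms a binary classification task, the quality of a CSF can be summarized with a ranking metric such as AUROC over failures versus successes; a small sketch (names ours) follows.

    import numpy as np

    def failure_auroc(confidence, correct):
        """AUROC of a CSF for separating correct predictions from failures;
        equals the probability that a randomly chosen success receives a
        higher confidence than a randomly chosen failure."""
        confidence = np.asarray(confidence, float)
        correct = np.asarray(correct, bool)
        fail, ok = confidence[~correct], confidence[correct]
        # Mann-Whitney U statistic, counting ties as 1/2
        greater = (ok[:, None] > fail[None, :]).sum()
        ties = (ok[:, None] == fail[None, :]).sum()
        return (greater + 0.5 * ties) / (len(ok) * len(fail))

On Example 2.2.1, classifier B's constant CSF yields only ties (AUROC 0.5, no separability), while classifier A's full-range CSF can approach 1.0 despite being less calibrated.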
2.3 Document Understanding

This section focuses on the history and definition of DU as a field of AI. Like all subfields of AI, DU has been evolving rapidly, and the definition of a document has been changing accordingly. We identify three main stages in the evolution of the field, dependent on a) the type of learning, b) the unit of study, and c) the modality of the input. Regarding a), the field has followed the natural evolution from rule-based systems, to learning-based systems, to deep learning systems that build representations of documents. Regarding b), the field has evolved from region-based analysis, to page-level analysis, and is now moving to document-level analysis, as we have advocated in our research (Chapters 4 and 5). Regarding c), the field was originally dominated by OCR, particularly CV, then by KIE, emphasizing NLP, and now by both CV and NLP, with more attention given to multimodality and generative models by which new tasks can be approached, e.g., DocEdit [311].

Below, we expound on the evolution of the field through the lens of each modality, and the tasks that are typically associated with it. We also provide an overview of the most popular datasets and models in each task/modality.

The term Document Understanding (DU) is used in a variety of contexts (historical, research, commercial), and its definition deserves some attention. A seminal reference [430] dates back to 1992, which defines DU as 'the study of all processes involved in taking a document through various representations': from a physical object to a digital image, from an image to a symbolic description, and from a symbolic description to a high-level semantic representation. At the time, the field was dominated by Optical Character Recognition (OCR), particularly CV, and the definition was focused on the physical-to-digital conversion of documents, excluding born-digital documents.

Furthermore, the subterm document is used in the context of NLP (in particular in summarization) to denote a textually-rich document: a sequence of words exceeding a sentence or paragraph, or a single unit in a corpus. However, in DU it denotes a visually-rich document (VRD), which can be a combination of text, images, tables, and other elements. There is no universally established definition of a document [53], and it is used interchangeably with the term page, which is a physical, symbolic unit. In Chapter 4, we come back to this definition, addressing the misalignment of research with how documents occur in practice.

Over time, the quality of OCR has improved, and the focus of the field has shifted from OCR to document image classification (DIC) and key information extraction (KIE), which are more application-directed recognition tasks.

[Figure 2.4 shows an example invoice annotated with the outputs of common DU tasks: OCR (the raw transcribed text), DIC (document type: invoice), KIE (document number: 29069; document date: 12/21/2020), DocVQA (How much should be paid? $459.90), and DOD (bounding boxes for the logo and handwriting).]
Figure 2.4. A simple illustration of common DU tasks on an example document.

Arguably, most businesses are interested in the unstructured information contained in documents, rather than the documents themselves. On the commercial side, the combination of these tasks is often referred to as Intelligent Document Processing (IDP), albeit 'understanding' has been similarly marketed by, e.g., UIPath (originally an RPA company, now looking at AI as the next frontier of automation). The scientific community has been more careful in using the denomination 'understanding' [29], with the DUE benchmark [47] defining it, on the one hand, as an end-to-end process involving a subset of human cognitive skills, and on the other hand, enumeratively with several well-defined problems (OCR, KIE, VQA as defined in Section 2.3.1).

In our research, we have extended DU to denote 'the ability to holistically consume textual and visual elements structured according to rich semantic layouts, and reason over compositional information extracted from a VRD to generate meaningful insights or actions'. There is no specific notion of tasks, but rather an emphasis on the end-to-end process leveraging all modalities intrinsic to documents, where a generic DU model is expected to generalize to any task on any document from any domain. This stands in stark contrast to only DIC and KIE, where local context generalization (key-value pairs) is rewarded, whereas DU as defined here aims to generalize beyond the local context of a document.
2.3.1 Task Definitions

For a thorough understanding, each task will be defined in terms of the following components: input, output, model, and evaluation. Most tasks use a single document page as input (for both legacy and computational reasons), and the output depends on the task.

Formally, a page p consists of an image v ∈ R^{C×H×W} (number of channels, height, and width, respectively) with T word tokens u = {w_t}_{t=1}^T, where w_t maps to (sub)words in a vocabulary V, organized according to a layout structure s = {(x_t^1, y_t^1, x_t^2, y_t^2)}_{t=1}^T, typically referred to as token bounding boxes (top-left to bottom-right corner), coming from OCR or available from a born-digital document. Standardized notation for document inputs beyond a single page has been established in Chapter 4 [470].
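A minimal sketch of how this page representation could be typed in code follows; field names are illustrative and not taken from any specific library.

    from dataclasses import dataclass

    import numpy as np

    @dataclass
    class Page:
        """Single-page input following the notation above."""
        image: np.ndarray                       # v, shape (C, H, W)
        tokens: list[str]                       # u = (w_1, ..., w_T), (sub)words from OCR
        boxes: list[tuple[int, int, int, int]]  # s, per-token (x1, y1, x2, y2)

        def __post_init__(self):
            assert len(self.tokens) == len(self.boxes), "one bounding box per token"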
Optical Character Recognition (OCR) is the task of converting a document image to a sequence of characters. The input is a document image, and the output is a sequence of characters. The output space Y is the set of all possible characters (e.g., a, b, c, ..., A, B, C, ...), typically restricted to a subset of characters based on the document language and orthography. The quality is evaluated with a metric such as the word error rate (WER) or the character error rate (CER).

Document Classification (DC) is the task of assigning a document to a predefined class. The input is a document image, and the output is a class label. The output space Y is the set of all document classes (e.g., invoice, email, form, advertisement). Standard metrics are accuracy and the F1 score (under class imbalance).

Key Information Extraction (KIE) is the task of extracting key information from a document. The input is a document image, and the output is a set of key-value pairs. The output space Y is the set of all key-value pairs (e.g., date: 2024-01-01, total: 1000.00, ...), where keys are pre-defined as part of a format relevant to the document class in scope. In practice, it is implemented as sequence labeling with y = {y_1, y_2, ..., y_T}, where y_t ∈ Y is a label from an IOB/IOBES-encoded label set Y (B-DATE, I-DATE, ..., O). Extraction quality is evaluated with the sequence F1 score to account for the imbalance with the 'O' token.

Document Visual Question Answering (DocVQA) is the task of answering a question about a document. The input is a document image and a question, and the output is an answer. Depending on the type of question, the output space changes. Extractive questions (ExQA) require a subspan of the document's text as answer, y = (y_start, y_end) with y_start ≤ y_end and y_start, y_end ∈ {1, ..., T}. Abstractive questions (AbsQA) require a sequence of tokens as answer, y = {y_1, y_2, ..., y_T'} with y_t ∈ V. The latter is more complex to evaluate, yet more interesting for testing 'understanding' than restricting evaluation to answer spans, which is why we introduced AbsQA as part of Chapter 5. Orthogonal to the previous two types, DUDE introduces list questions with multiple or multi-span (ExQA) answers. Predicted answers are evaluated using ANLS, with multiple extensions defined in Section 2.2.3.

Document Layout Analysis (DLA) is the task of analyzing the layout of a document in terms of logical layout elements (e.g., text blocks, headers, figures, plots, tables). The input is a document image, and the output is a set of bounding boxes and their respective labels. The output space Y is the set of all possible bounding boxes and labels. More formally, it outputs a set of tuples, where each tuple (b_j, c_j) represents one of J detected logical layout elements. For each, b_j denotes the bounding box for the j-th detected element, defined as (x_j, y_j, w_j, h_j) (in the popular COCO format), and c_j is the class label for the j-th element, indicating its object category. Evaluation is done with the standard COCO metrics, i.e., average precision (AP) over different intersection-over-union (IoU) thresholds, and mean AP (mAP).

Document Generation (DG) is the task of generating a document from a set of key-value pairs and potential metadata attributes, e.g., visual appearance or color scheme. The output space Y is the set of all possible document images, which makes it hard to evaluate in a quantitative manner. Some efforts have been made to define metrics for document generation, e.g., Document Earth Mover's Distance [169], but they are not yet widely adopted.

Other lesser-known tasks include document object detection (DOD), table structure recognition (TSR), document retrieval, document editing, document translation, document summarization, and document authenticity verification. With the rise of multimodal models, more data types are being considered jointly with documents under the umbrella term visually-situated language, such as charts, tables, handwriting, text-heavy scenes or illustrations, webpage and user interface screenshots, etc.

2.3.2 Datasets

With the variety of tasks, there is a large number of datasets available for each DU task. Instead of exhaustively enumerating datasets for each task defined above, we will link to the tables in the respective chapters treating these tasks. We only highlight some more recent datasets, which are not yet included in the tables.

An overview of document source datasets for pretraining or dataset construction is presented in Table 4.1 as part of Chapter 4. For an overview of DC datasets, see Table 4.2 in the same chapter. For an overview of KIE datasets, we refer to [47], with some newer datasets [422, 485] linked here. An overview of DocVQA datasets is presented in Table 5.1, with the introduction of the DUDE dataset (Chapter 5). An interesting new addition is PDFTriage [400], which focuses more on retrieval than on QA. Finally, some datasets for DLA are presented in Table 6.1 as part of Chapter 6. Other essential datasets are PubLayNet [544] and DocBank [261], and the novel multidomain M6 dataset [71].
2.3.3 Models

A model taxonomy is presented in [407] that differentiates models based on the input modalities they use, the geometric approach, dependence on OCR, or the type of output they produce. However, it is far from comprehensive, due to missing out on various DU tasks and more recent models. Table 2.2 presents an overview of models that we have applied to various DU tasks, extending the taxonomy with our observations.

Depending on the modalities considered and the requirements of the task, different pretrained models have been used in practice, instead of the document foundation models presented above. For document text, the most popular models are BERT [95], RoBERTa [287], and T5 [383]. Additionally, text-only LLMs such as GPT-3 [52], Llama [452], and Mistral [199] are increasingly applied to document text. For document images, the most popular models are ResNet [167], EfficientNet [439], and DiT [259]. For all modalities combined, the most popular models are the LayoutLM series [187, 502, 503], DocFormer(v2) [15, 16], and UDOP [443]. The former are OCR-based pipelines, with pixel-only models such as Donut [216] and Pix2Struct [247] gaining popularity for increased efficiency, albeit they are still catching up on performance. Alternative approaches include the use of graph neural networks [286, 341, 517] and grid-based models [212, 275], yet their performance lags behind the aforementioned sequence models.

Most of the above-mentioned models have been applied during the Chapter 5 benchmark experiments, with only results missing for multimodal LLMs, which were introduced after the publications of the chapter. An up-to-date overview of newer multimodal LLMs, e.g., GIT2, PaLi, Flamingo, Kosmos-2, GPT-4, Fuyu, Llava, CogVLM, that could potentially be applied to DU tasks is presented in [512].

Model | Year | Conf. | Arch. | Input Mod. | Vision Branch
LayoutLMv1 [502] | 2020 | KDD | E | T+S | -
DocStruct [484] | 2020 | EMNLP | E | T+V+S | Resnet50
StrucText [266] | 2021 | ACM | E | T+V+S | Resnet50 + FPN
StructuralLM [254] | 2021 | ACL | E | T+S | -
LayoutLMv2 [503] | 2021 | ACL | E | T+V+S | ResNeXt 101
SelfDoc [263] | 2021 | CVPR | E | T+V+S | F-RCNN
LamBERT [134] | 2021 | ICDAR | E | T+S | -
TILT [371] | 2021 | ICDAR | E+D | T+V+S | U-Net
DocFormerv1 [15] | 2021 | ICCV | E | T+V+S | Resnet50
UniDoc [153] | 2021 | NeurIPS | E | T+V+S | Resnet50
DiT [259] | 2022 | ACM | E | V | ViT
LayoutLMv3 [187] | 2022 | ACM | E | T+V+S | Linear
BROS [181] | 2022 | AAAI | E | T+S | -
XYLayoutLM [154] | 2022 | CVPR | E | T+V+S | ResNeXt 101
FormNet [245] | 2022 | ACL | E | T+S | -
ERNIE-Layout [264] | 2022 | EMNLP | E | T+V+S | F-RCNN
LiLT [481] | 2022 | ACL | E | T+S | -
XDoc [66] | 2022 | EMNLP | E | T | -
GeoLayoutLM [296] | 2023 | CVPR | E | T+V+S | F-RCNN+ConvNeXt
Vision Grid Transformer [80] | 2023 | ICCV | E | T+V+S | ViT
DocFormerv2 [16] | 2023 | - | E+D | T+V+S | Linear
Donut [216] | 2022 | ECCV | E+D | V | SwinTransformer
Pix2Struct [247] | 2023 | ICML | E+D | V | ViT+variable res
UDOP [443] | 2023 | CVPR | E+D | T+V+S | ResNeXt 101
Hi-VT5 [451] | 2023 | PatRecog | E+D | T+V+S | ViT
FormNetv2 [246] | 2023 | ACL | E | T+V+S | 3-layer CNN
LayoutMask [458] | 2023 | ACL | E | T+S | -
UReader [510] | 2023 | ACL | D | V+S | CLIP-ViT
DocLLM [480] | 2024 | - | D | T+S | -
Gramformer [44] | 2024 | - | E+D | T+V+S | Linear
InstructDoc [442] | 2024 | - | E+D | T+V+S | CLIP-ViT

Table 2.2. Adapted from [16]. A summary of DU prior art with their architecture (E: Encoder, D: Decoder), the input modalities (T: text, V: vision, S: spatial features), and the vision feature branch. Missing entries are marked '-'.
2.3.4 Challenges in Document Understanding

To tease the contributions of our works, we highlight some of the most important challenges in DU, which are shared by all chapters in this thesis.

2.3.4.1 Long-Context Modeling

An important challenge for most SOTA DU models based on the Transformer architecture is long document processing, which is not yet solved satisfactorily and is the focus of Chapters 4 and 5. We illustrate the extent of the problem with the most popular DU model, LayoutLMv3 [187] (over 8.6M model weight downloads in January 2024), in Figure 2.5, pointing to the quadratic complexity of attention, which cannot be parallelized over pages with encoder-only models. Hi-VT5 [451] is the only model that is by design usable for multipage documents, yet it requires a lot of memory and depends on compressing page information into learnable embeddings.

[Figure 2.5 diagrams the LayoutLMv3 encoder: token, 1D/2D position, layout, and patch embeddings are concatenated into one sequence, so each of the S Transformer blocks computes multi-head attention over (T+M)² token pairs.]
Figure 2.5. Inefficiency of document foundation models for processing multipage documents, illustrated with LayoutLMv3 [187]. Notation: L pages, T text tokens, M linearized visual patches, S Transformer layers.

While a page is the modeling unit of preference for maintaining computational efficiency when Transformers process sequences of tokens, it is not the natural appearance of a document. Some tasks require the global document context, and treating each page as contextually independent is suboptimal, as argued in our works on multipage document classification (Chapter 4) and DocVQA (Chapter 5) with multi-hop question answering.
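A back-of-the-envelope sketch of the quadratic blow-up follows; the per-page token and patch counts are assumptions chosen for illustration.

    # Rough size of the (T+M)^2 self-attention matrix when concatenating
    # L pages into one encoder sequence, versus encoding pages independently.
    def attention_entries(pages: int, text_tokens: int, patches: int) -> int:
        seq = pages * (text_tokens + patches)   # joint multipage sequence
        return seq * seq

    tokens_per_page = 512    # T, assumed
    patches_per_page = 196   # M, assumed
    for L in (1, 5, 20):
        joint = attention_entries(L, tokens_per_page, patches_per_page)
        per_page = L * attention_entries(1, tokens_per_page, patches_per_page)
        print(f"L={L:2d} pages: joint {joint:.2e} vs page-wise {per_page:.2e}")

The joint sequence grows as L²·(T+M)², while page-wise encoding grows only linearly in L, which is exactly the trade-off hierarchical architectures try to exploit.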
Figure 2.6 illustrates how a prototypical multimodal architecture, Hi-VT5 [451], is used for the task of multipage extractive DocVQA.

[Figure 2.6 shows the Hi-VT5 pipeline: each page's question tokens Q, OCR tokens with bounding boxes, and visual features are processed by a shared document encoder; the compressed page representations are passed to an answer-type module (output: Extractive), a question-type module (output: Quantity), and an answer decoder, which produces '$8,834.17' for the question 'How much does Solardyne still owe GroSolar?'.]
Figure 2.6. Hi-VT5 architecture for multipage, extractive DocVQA.

In principle, every LLM can perform multipage document processing, depending on the ability of the LLM to extrapolate to longer context windows, given the position representation method (barring absolute positional encodings), with performance also relying on having trained on long sequences, e.g., by instruction-tuning on long-context data. Naturally, the computational cost will increase with the length of the input data, yet recently many advances have made subquadratic complexity feasible (e.g., relative positional encodings [382], ALiBi [374], FlashAttention [82], multi-query attention [9], etc.); a sketch of the ALiBi bias follows below. [102] provides an overview of the SOTA in long-range Transformers for DU tasks. A recent approach [44] proposes a hierarchical architecture to model both local page-level attention and global document-level attention on learnable document-level tokens, with an additional compression module to scale to 100+ pages while keeping latency low.
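As an example of such an extrapolation-friendly position method, the sketch below constructs the ALiBi [374] attention bias, which replaces learned position embeddings with a head-specific linear penalty on query-key distance; the slope schedule shown is the power-of-two-heads case from the paper.

    import numpy as np

    def alibi_bias(seq_len: int, n_heads: int) -> np.ndarray:
        """ALiBi bias added to attention logits before the softmax;
        the causal mask itself is applied separately."""
        # geometric head slopes 2^(-8/n), 2^(-16/n), ... as in ALiBi [374]
        slopes = 2.0 ** (-8.0 * np.arange(1, n_heads + 1) / n_heads)
        distance = np.arange(seq_len)[None, :] - np.arange(seq_len)[:, None]
        distance = np.minimum(distance, 0)  # only penalize looking back
        return slopes[:, None, None] * distance[None, :, :]  # (heads, q, k)

Because the penalty is defined for any distance, the same bias extends to sequences longer than those seen in training, which is what enables length extrapolation.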
2.3.4.2 Document Structure Modeling

Representing structured documents as plain text resulting from OCR is not congruent with how humans perceive documents [294], which is the focus of Chapter 6. Document layout is a valuable cue for navigating a document's structure and finding information more efficiently, but it is not always modeled properly, with most methods relying on geometric features (1D/2D absolute positional encodings) that are neither robust to OCR errors nor able to capture the semantic complexity of document layouts.

There are notable recent advances in layout modeling, e.g., modeling relative positions with polar coordinates and layout attention with Gaussian biases [555], and DocLLM [480], which ignores visual features to focus on disentangling the layout structure from the document text; these are promising directions for future research.

2.4 Intelligent Automation

Automation is the use of technology to perform tasks with reduced human assistance. Throughout history, humankind has experienced waves of automation, from the invention of the wheel to the steam engine, the assembly line, and the computer. Manual labor in particular, performed by blue-collar workers, has been increasingly automated since the 20th century. When automation is applied to knowledge work as performed by white-collar workers, more through the use of software than hardware, it is referred to as Intelligent Automation (IA, not to be confused with the French acronym of 'intelligence artificielle') [1].

IA is a rapidly growing field, with the market for hyperautomation-enabling technologies projected to have reached nearly $600 billion in 2022, a 24% increase from 2020 [392]. A recent survey [135] does show that IA adoption is lagging behind expectations, with only 19% of organizations having deployed their automation programs and 38% in the planning stage.

[48] identified five key trends in IA: 1) the rise of the digital workforce, 2) the emergence of the digital twin, 3) the importance of data, 4) the need for orchestration, and 5) the rise of the citizen developer. The first three trends are particularly relevant to the work presented in this thesis.

IA is a subset of Artificial Intelligence (AI) specifically designed for the automation of knowledge work. It encompasses several technologies, including Robotic Process Automation (RPA), which can be thought of as software to automate routine tasks, and Workflow & Business Process Management (BPM). When combined with people and organizations, these technologies are capable of solving major world problems [48].

The goal of IA is to create a software-based digital workforce by mimicking the four main human capabilities required to perform knowledge work: vision, language, thinking & learning, and execution. This allows for the construction of straight-through business processes, which are more efficient in terms of productivity, processing speed, and cost, and often more effective in terms of quality and logic. The ultimate aim is not to replace human workers, but to take the robot out of the human, augmenting human intelligence, creativity, and productivity.

IDP/DU is a prototypical example of an IA use case, as it frees workers from paperwork, allowing them to focus on more value-adding tasks, thereby providing a clear perspective on the future of work. Finally, we provide an overview of the requirements for setting up IA, linking back to the technical concepts introduced before.

Enabling IA requires well-defined CSFs and either operational thresholding to determine the trade-off between automation and risk, or a selective prediction setup (a sketch of operational thresholding closes this section). When a system is deployed in production, it also requires robustness to distribution shifts, both expected and unexpected, and the ability to detect and predict a wide variety of failures.

Measuring IA is performed using calibration metrics and confidence ranking metrics. Calibration is the degree to which a model's predicted probabilities match the true probabilities of the events it predicts. Confidence ranking is the degree to which a model's predicted probabilities are ordered in accordance with the true correctness of its predictions. If the i.i.d. assumption becomes violated, the model's confidence ranking will be affected, and the model may become overconfident on OOD inputs. As part of the deployment process, it is important to monitor the model's performance and to detect when it starts to fail, where other metrics are more appropriate.

Improving IA Improvements to IA can be made by inducing calibration through post-hoc strategies or by designing calibrated loss functions, as well as through predictive uncertainty estimation for model selection and for capturing issues with the data or model before deployment; all investments in failure prediction will be rewarded with more robust and reliable systems.
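To close the loop on operational thresholding, a small selective-prediction sketch follows: given a held-out set of CSF scores and correctness labels, it picks the lowest threshold whose accepted subset stays within an error budget. The risk budget and function names are illustrative assumptions, not a production recipe.

    import numpy as np

    def automation_threshold(confidence, correct, max_risk=0.05):
        """Return (threshold, coverage) maximizing automation while keeping
        the error rate on accepted instances within max_risk."""
        confidence = np.asarray(confidence, float)
        correct = np.asarray(correct, bool)
        order = np.argsort(-confidence)
        # running error rate over the largest-confidence prefixes
        err = np.cumsum(~correct[order]) / np.arange(1, len(order) + 1)
        ok = np.where(err <= max_risk)[0]
        if len(ok) == 0:
            return float("inf"), 0.0  # nothing can be automated at this risk
        k = ok[-1]  # largest accepted prefix within the budget
        return confidence[order][k], (k + 1) / len(order)

Instances scoring below the returned threshold would be routed to a human, which is precisely the automation/risk trade-off discussed above.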
Part I

Reliable and Robust Deep Learning

Chapter 3

Benchmarking Scalable Predictive Uncertainty in Text Classification

The contents of this chapter come from two publications [465, 466]:

Jordy Van Landeghem, Matthew B Blaschko, Bertrand Anckaert, and Marie-Francine Moens. Predictive Uncertainty for Probabilistic Novelty Detection in Text Classification. In ICML Workshop on Uncertainty and Robustness in Deep Learning, 2020.

Jordy Van Landeghem, Matthew Blaschko, Bertrand Anckaert, and Marie-Francine Moens. Benchmarking Scalable Predictive Uncertainty in Text Classification. IEEE Access, 2022.

The first publication started as a reproduction of [500] with a deeper focus on text classification, and the second publication is a large journal extension of the first.

This chapter focuses on how to quantify uncertainty in text classification tasks, which is a prerequisite for trusting a model's predictions in real-world applications such as intent classification in automated document processing based on the document text. We conduct a benchmarking study of uncertainty estimation methods applied to 6 real-world text classification datasets, including both multi-class and multi-label classification, with 1-D convolutional neural networks and pretrained transformers. The experiments empirically investigate why popular scalable uncertainty estimation strategies (Monte-Carlo Dropout, Deep Ensemble) and notable extensions (Heteroscedastic, Concrete Dropout) underestimate uncertainty, and how to improve their performance. We motivate that uncertainty estimation benefits from combining posterior approximation procedures, linking it to recent research on how ensembles and variational Bayesian methods navigate the loss landscape.

We find that our proposed method combination of Deep Ensemble with Concrete Dropout demonstrates superior performance, by analysis of in-domain calibration, cross-domain classification, and novel class robustness, even at a smaller ensemble size. Our results corroborate the importance of fine-tuning the dropout rate to the text classification task at hand, which individually and as an ensemble impacts model robustness. We observe in ablation that pretrained transformers severely underperform in novelty detection, limiting the applicability of transfer learning when distribution shift from novel classes can be expected.
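To make the two baseline strategies of this chapter concrete, the sketch below averages predictive distributions over stochastic forward passes (MC Dropout) and/or independently trained models (Deep Ensemble), and scores uncertainty with predictive entropy; model_fns is an assumed list of callables returning class probabilities.

    import numpy as np

    def mc_predictive(model_fns, x, n_samples=10):
        """Predictive distribution and entropy from Monte-Carlo sampling.
        For MC Dropout, model_fns holds one model with dropout active at test
        time; for a Deep Ensemble, it holds the ensemble members (and the two
        can be combined, as proposed in this chapter)."""
        draws = np.stack([f(x) for f in model_fns for _ in range(n_samples)])
        mean = draws.mean(axis=0)                          # predictive distribution
        entropy = -(mean * np.log(mean + 1e-12)).sum(-1)   # predictive uncertainty
        return mean, entropy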
Supporting context: As the publications were written at the start of my PhD, we take the opportunity here to give an update on the state of the art and the relevance of our work in uncertainty estimation research.

The journal extension was motivated as a survey and benchmark of scalable Bayesian Deep Learning methods, in which we introduced novel hybrid models and evaluated uncertainty estimation quality under distribution-shift configurations. We also provide a convenient entry point for practitioners, as our benchmarking software is available online (https://github.com/JordyVL/uncertainty-bench). Our work has also been re-used as the basis of a conference tutorial [524, https://sites.google.com/view/uq-tutorial].

In a similar spirit to our work, new benchmarks have put different aspects of reliability and robustness to the test: Shifts [306] focuses on the robustness of uncertainty methods to real distribution shifts in large-scale tasks across overlooked modalities such as tabular, audio, or sensor data; WILDS [220, 401] curates a collection of labeled and unlabeled datasets exhibiting distribution shifts in the wild; OpenOOD [527] generalizes a comprehensive benchmark for out-of-distribution detection, anomaly detection, and open-set recognition; and finally, PLEX [455] probes pretrained models on their ability to estimate uncertainty, exhibit robustness under shifts, and adapt in settings of active, few-shot, and life-long learning.

The supremacy of ensemble methods has been challenged by the recent publication of [346], which proposes a new method for uncertainty estimation in NNs, called EpiNet. The authors claim that their non-Bayesian method is able to discern the difference between ambiguity and lack of data. Key ingredients are a dyadic sampling procedure, which creates interesting data pairs that are used to train a NN to predict the epistemic uncertainty, and a small architecture that can supplement any conventional NN to improve OOD detection and active learning [413]. Another competitive method [326] concentrates on feature-space density estimation under the assumptions of smoothness and sensitivity, with their efficient baseline disentangling epistemic uncertainty (a Gaussian Mixture Model fit on training features, with a separate covariance matrix per class) and aleatoric uncertainty (the entropy of the softmax distribution). Other promising methods target aleatoric uncertainty, such as [75, 474], which focus on label noise or ambiguous tasks such as toxicity detection.

An important observation on the benefits of Bayesian NNs concerns the dataset and model size: Bayesian modeling particularly shines in dynamic settings where the size of the model/data is unknown or changes over time [346], e.g., online, continual, active, and life-long learning. In static settings with high accuracy on a fixed test set, the benefits of Bayesian modeling are less pronounced [215].

Next to PUQ, alternative approaches have sought to learn explicit scoring functions [200, 351] or to assess the similarity of inputs to the training distribution [54, 271, 285, 379, 487]. All of these efforts have recently increased in popularity, as uncertainty estimation has become even more important for the safe deployment of LLMs in user-facing applications [111].
+ +3.1 + +Introduction + +Reliable uncertainty quantification is indispensable for any machine learning +system trusted in decision-making in many application domains such as medical +diagnosis, self-driving cars and automated document processing. In any typical +industrial application, we desire predictive uncertainty to communicate on the +model’s lack of in-domain knowledge due to either training data scarcity or +model design errors, or its ability to flag potentially noisy, shifted or unknown +input data (see [136] for more detail on sources of uncertainty). +Supervised Deep Learning (DL) algorithms have been found to provide +“catastrophically overconfident predictions” [116] under data distribution shift. +Specifically, novel class distributions can emerge at inference time [367], which +desirably should be detectable in a model’s uncertainty. To this end, scalable +Bayesian DL (BDL) methods for uncertainty estimation have been recently +developed, generating increased interest from practitioners in need of practical +solutions. BDL comprises an increasingly large range of theoretically well- + + \ No newline at end of file diff --git a/assets/txts/pg_0079.txt b/assets/txts/pg_0079.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d8c790af013c6057c75b8345a20a41adbd01ff9 --- /dev/null +++ b/assets/txts/pg_0079.txt @@ -0,0 +1,44 @@ +INTRODUCTION + +47 + +motivated predictive uncertainty methods (PUQ), yet only some are able to +scale in network architecture and dataset size. Additionally, most surveys +and research output on predictive uncertainty is based on multi-class image +classification or regression experiments. We argue that predictive uncertainty +methods and how well they scale in Natural Language Processing (NLP), for +text classification tasks, is still an under-explored question. +The context of our study is a production-level text classification system for +automatically handling incoming communications in information-intensive +industries (e.g. legal, banking, insurance). Imagine a digital-first company +where each department has its own document classifier operating under a closed +world assumption. However, whenever a client mistakenly sends a document (car +purchase invoice requesting a loan) to the wrong department (say underwriting +or medical claims), this can generate high-confidence false positives that trigger +the wrong action (insurance or claim settlement instead of loan application). +Similarly, if an insurance broker suddenly decides to completely change the +document template that clients use to apply for a car loan, the production +model might not find previously salient features which it had learned to rely on +for accurate classification. This shows that detection of anomalous inputs and +shifting distributions is critical to keep errors in automation low. +We investigate different techniques and procedures for incorporating uncertainty +into DL models for text classification, analyzing the degree to which they can +reliably capture uncertainty under extrapolation (outside the support of the +training set), both individually and combined in an ensemble. Our findings for +individual predictive uncertainty methods are overall consistent with benchmarks +in other modalities, with Deep Ensemble reporting greater robustness than +approximate Bayesian methods. 
However, we discover from empirical findings +that our newly proposed combinations, particularly MC Concrete Dropout +Ensemble, can push the bounds by exploiting the in-domain calibration effect of +Concrete Dropout and all-round ensemble qualities for increased out-of-domain +and novel class robustness. +We intend our work to be used as a survey and benchmark of scalable BDL +methods, where the architectures and datasets are drawn from NLP, thereby +covering a void in the literature on uncertainty estimation in this field. Next to +proposing a well-motivated evaluation methodology, this chapter also provides +a convenient entry point for practitioners.1 +Our key contributions can be summarized as follows: +• We conduct a benchmarking study of established uncertainty estimation +1 Our benchmarking software [TensorFlow 2] is available at https://github.com/JordyVL/uncertainty-bench + + \ No newline at end of file diff --git a/assets/txts/pg_0080.txt b/assets/txts/pg_0080.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba6922f603a88fa4da9f7b0203994de76f6b2d99 --- /dev/null +++ b/assets/txts/pg_0080.txt @@ -0,0 +1,46 @@ +48 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +methods applied on real-world text classification datasets. Our analysis +focuses on model robustness and uncertainty quality in realistic data +distributions. We propose a practical methodology to test the above, +resulting in a better understanding of the individual shortcomings of +predictive uncertainty methods. +• We motivate and introduce novel combinations of predictive uncertainty +methods, providing empirical evidence for their complementary benefits. +Through statistical analyses and ablation experiments we discern the +importance of certain prior, model or hyperparameter influences on the +reliability of predictive uncertainty. +Organization The paper is organized as follows. Section 3.2 overviews +related work in uncertainty benchmarking, distribution shift, and uncertainty +estimation in NLP. We present core concepts of BDL in Section 3.3 to build +up a thorough understanding of predictive uncertainty in theory and practice. +We include this introductory text for readers less familiar with uncertainty +methods. Section 3.3.5 critically analyzes the practice of evaluating uncertainty +under distribution shift. Sections 3.3.4 and 3.4.1 stand central in our work, +connecting recent research on how neural networks navigate the loss landscape +with posterior approximation procedures, followed by our work’s hypotheses on +complementary benefits between predictive uncertainty methods. +Section 3.4 details our methodological setup from datasets, model architectures, +uncertainty estimation and evaluation, to experimental settings. We present +in Section 3.5 the results of 3 large benchmarking experiments, followed by +4 smaller ablation studies on important hyperparameters. After closing the +discussion in Section 3.6 with take-home messages targeting researchers and +practitioners interested in uncertainty prediction in text classification, Section 3.7 +details additional experiments, and Section 3.8 draws up some limitations of our +research. Finally, we synthesize our contributions in Section 3.9 and propose +directions for future work on uncertainty research in NLP. +The Appendices support the main text by detailing implementation (A), +practical considerations (compute, timings) (B), and detailed evaluation data +for full transparency (C). 
+ +3.2 + +Related Work + +In this Subsection, we overview recent literature on benchmarking the quality of +uncertainty quantification in DL and more specifically research on uncertainty +estimation for NLP tasks. + + \ No newline at end of file diff --git a/assets/txts/pg_0081.txt b/assets/txts/pg_0081.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb341f5319a7e28801cf477f8c01954fb53bf62c --- /dev/null +++ b/assets/txts/pg_0081.txt @@ -0,0 +1,47 @@ +RELATED WORK + +49 + +Increasingly, there are efforts from the research community to help BDL methods +scale to real-world scenarios [205]. Benchmarks are an important tool to +help researchers prioritize the right approaches and to inform practitioners +which methods are suited for their applications [276]. There is a growing +demand for benchmarking in BDL, since methods must be scored both for task +performance and uncertainty quality [411, 496]. Rigorously evaluating the latter +is considerably more difficult, since depending on the problem setting no direct +uncertainty ground-truth exists, requiring a well-defined experimental setup +[323]. +A standard benchmark in BDL is UCI [176], a set of curated regression datasets, +which allows to judge uncertainty quality with the predictive log-likelihood +metric. However, its general applicability and validity has been criticized on +multiple accounts [113, 323, 360]. +More recently, [19, 113, 301, 348, 462] presented large-scale evaluation studies +of BDL methods with benchmarking on real-world datasets. These studies +motivate data retention and distribution shift as generic protocols for evaluating +predictive uncertainty. Similarly, we argue that even mild shifts of data are +unavoidable in real-world applications and, conditional to specific distribution +shift assumptions (see Section 3.3.5), this provides a good testing ground for +uncertainty evaluation. +[348] consider two types of distribution shift: (a) out-of-distribution (OOD) data +from separate datasets, and (b) adversarial shift, where the test distribution +consists of perturbed or corrupted ground truth data isolated from training. +In our work we propose novel class detection as an alternative to a), which we +motivate to be a more representative experimental setup for testing uncertainty +in text classification (more detail in Subsections 3.3.5 and 3.4.5.3). [142] bring a +similar argument against b) that adversarial examples are often overly synthetic +and disconnected from real-world performance concerns, which we assert to be +especially true for perturbations applied to text data. Therefore, we derive a +challenging experimental setup for b) (more detail in Section 3.4.5.2) inspired +by the extensive literature in NLP on the problem of domain shifts and domain +adaptation [45, 84, 129, 203, 388, 557]. Domain adaptation approaches aim to +mitigate performance degradation that occurs when transferring a classifier from +a source domain to a target domain. Learning under domain shift presents a +complex challenge in text classification since linguistic patterns can be highly +different across domains, even harder to tackle when domains are unknown a +priori [388]. While out-of-domain generalization is the ultimate objective [18], +we believe that accurate uncertainty prediction has a major role to play in the +detection of out-of-domain data, which is currently under-explored. 
[488] is a +notable exception where predictive uncertainty methods are leveraged to learn +domain-invariant features in unsupervised fashion. + + \ No newline at end of file diff --git a/assets/txts/pg_0082.txt b/assets/txts/pg_0082.txt new file mode 100644 index 0000000000000000000000000000000000000000..69a6aeba724bd5f62c752e341020c0bd26ec88ae --- /dev/null +++ b/assets/txts/pg_0082.txt @@ -0,0 +1,47 @@ +50 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +In this work we only consider methods that directly estimate the predictive +posterior and aim at obtaining high quality uncertainty estimates by +discriminative models without any additional OOD components. However, +there exists a large number of alternative OOD detection and generalization +approaches. We surmise that these can be more effective in handling the +above distribution shifts, yet they have different modeling assumptions which +complicates a direct comparison, for instance, access to (auxiliary) OOD data +[271, 285], generative modeling [334], focus on abstention mechanisms [138], or +characterization of dataset shifts with a two-sample-testing approach [379]. We +recommend [54, 414] for an overview of these approaches. +While previous BDL benchmarks have helped standardize protocols, metrics and +analysis tools, the effort is not spent equally across all modality and problem +settings (as can be observed in the survey of [4]). Arguably, most research on +uncertainty estimation focuses on regression and image classification tasks as +they offer visual validation on uncertainty quality, e.g., [214]. +Tasks in the NLP field involve discrete natural language units (word, sentence, +paragraph) as input, which requires a translation to the continuous domain by +embedding discrete units to form high-dimensional distributed representations +[321]. This presents additional complexity compared to image or time-series +data which as continuous signals can be directly fed into a Neural Network +(NN). Furthermore, specialized algorithms (e.g., dealing with long sequences, +attention for larger memory [473]) and progressively more complex architectures +[27] are being created to tackle this unique challenge in NLP, which can affect +the performance of predictive uncertainty techniques. With our work, we +start the exploration into effects of field characteristics, notably different NLP +architectures, inherent task complexity, and properties of language in text +processing (e.g., ambiguity [397], document length [478], pre-defined vocabulary +[68]) that could cause problems when predicting uncertainty. More specifically, +we seek to answer how uncertainty research translates to a prototypical language +task such as text classification, which more frequently than vision tasks is +characterized by non-mutually exclusive labels [312], a problem setting ignored +by existing BDL benchmarks. +BDL research on NLP tasks is generally limited, certainly when considering +quantitative evaluation of predictive uncertainty quality. While we draw +inspiration from the uncertainty estimation methods of [500], their study focuses +on the performance increase of non-probabilistic measures (mean-squared error) +and only reports sentiment regression results. Moreover, we find no quantitative +evaluation of the quality of the uncertainty scores and comparison to simpler +measures of uncertainty, for instance, softmax score or predictive entropy. 
[174] does focus on the robustness of pretrained Transformers to distribution shift, yet without application of any predictive uncertainty methods. [322, 533] present
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0083.txt b/assets/txts/pg_0083.txt
new file mode 100644
index 0000000000000000000000000000000000000000..65f33eb738e4cd8897991e856b5f6c6be45d041c
--- /dev/null
+++ b/assets/txts/pg_0083.txt
@@ -0,0 +1,47 @@
+UNCERTAINTY METHODS
+
+51
+
+similar setups applying Monte Carlo Dropout to regular NLP architectures in an active learning setup, yet they only aim to increase overall predictive performance by relying on in-domain calibration. Our work benchmarks individual and joint predictive uncertainty methods in multiple text classification task settings over two well-motivated uncertainty evaluation setups, testing robustness to distribution shift for NLP problems.
+
+3.3
+
+Uncertainty Methods
+
+The first Subsection formally presents how to quantify uncertainty in BDL and how popular methods approach inference differently. Section 3.3.2 treats predictive uncertainty methods with a focus on the algorithmic procedure, followed by representative method extensions for more reliable uncertainty estimation. Section 3.3.3 describes from what sources uncertainty originates and how to quantify uncertainty at test time. In Section 3.3.4 we present the rationale of our study, connecting recent research on how NNs navigate the optimization landscape with the posterior approximation procedure of methods from Section 3.3.2. Section 3.3.5 provides a critical note on how distribution shift impacts uncertainty estimation and the evaluation thereof.
+
+3.3.1
+
+Quantifying Uncertainty in Deep Learning
+
+In modern Deep Learning, two common uncertainty (or inversely “confidence”) estimates are the maximum posterior class probability, known as softmax-score, and the predictive entropy over posterior class probabilities [415, 522]. However, [156]’s work on confidence calibration demonstrated these to be unreliable estimates of Neural Networks’ uncertainty. While post-hoc calibration methods such as Temperature or Vector Scaling [156, 419] can easily calibrate classifier uncertainty in-domain (further discussed in Section 3.3.5), they have been found to be less effective under increasing distribution shift [19, 348].
+Bayesian Deep Learning (BDL) methods build on solid mathematical foundations and hold promise for more reliable learned uncertainty estimates [496]. Drawing on the foundational works of [91, 179, 299, 300, 337], the “second-generation” in BDL [140] is geared towards finding practical and scalable approximations to the analytically intractable Bayesian posterior (Equation (3.1)). Inferring a prediction and the associated uncertainty for a new test input x^* (with its associated label vector y^*) requires computing the
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0084.txt b/assets/txts/pg_0084.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7d4a938482c516a356f411ab6fe2a2cc237fe25
--- /dev/null
+++ b/assets/txts/pg_0084.txt
@@ -0,0 +1,60 @@
+52
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+conditional probability of y^* given x^* and the training data D = {(x^{(n)}, y^{(n)})}_{n=1}^{N}:
+
+P(y^* | x^*, D) = \int P(y^* | x^*, D, \theta) \, \underbrace{P(\theta | D)}_{\text{posterior}} \, d\theta,    (3.1)
+
+with \theta representing all Bayesian Neural Network (BNN) parameters: weights w, biases b.
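+To make the two baseline estimates above concrete, before turning to scalable approximations of Equation (3.1): a minimal NumPy sketch of the softmax-score and predictive entropy, computed from a deterministic classifier's logits (function names and shapes are ours, not from the released benchmark code):
+
+    import numpy as np
+
+    def softmax(logits):
+        # subtract the row-wise max for numerical stability
+        z = logits - logits.max(axis=-1, keepdims=True)
+        e = np.exp(z)
+        return e / e.sum(axis=-1, keepdims=True)
+
+    def softmax_score(logits):
+        # maximum posterior class probability, shape (batch,)
+        return softmax(logits).max(axis=-1)
+
+    def predictive_entropy(logits, eps=1e-12):
+        # entropy over posterior class probabilities, shape (batch,)
+        p = softmax(logits)
+        return -(p * np.log(p + eps)).sum(axis=-1)
+
+Both are single-forward-pass quantities; the methods below instead approximate the posterior of Equation (3.1) by sampling.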
+In our study we will focus on two strategies with representative methods that circumvent the inference problem and have seen more widespread adoption given their ability to scale both in network architecture and dataset size.
+I. The weight-snapshots direction, Deep Ensemble [238], which aims to find different sets of model parameters. Snapshots can be collected during different stages of training [133, 186, 301], or by using a sampling process such as Markov Chain Monte Carlo (MCMC) [141, 180, 530].
+II. The stochastic computation-graph direction, Monte Carlo Dropout [124], which involves introducing noise over weights during training and estimating uncertainty with multiple stochastic forward passes.
+Recent works [283, 464] have proposed “single-model” uncertainty methods that ideally compute posterior uncertainty in one forward pass.
+Our work benchmarks representative methods from both categories (denoted in italics), motivating a cross-category comparison and analyzing their individual and joint effectiveness in modeling predictive uncertainty.
+Additionally, we later experimented with alternative scalable uncertainty methods, namely stochastic gradient MCMC methods, cyclical SG-MCMC (cSG-MCMC) [530], and a single forward pass uncertainty method incorporating a Gaussian Process (GP) output layer, Spectral-normalized Neural Gaussian Process (SNGP) [283]. Results and discussion for these are included as a self-contained subsection, Section 3.7.
+
+3.3.2
+
+Predictive Uncertainty Methods
+
+We will first introduce each method by explaining the algorithm, followed by advantages or identified shortcomings, with subsequent method extensions from the same procedure category. Finally, we will zoom in on how to quantify uncertainty using each method.
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0085.txt b/assets/txts/pg_0085.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8384c3fbe3cb1fc2c9473bb09e31e394bdf73311
--- /dev/null
+++ b/assets/txts/pg_0085.txt
@@ -0,0 +1,67 @@
+UNCERTAINTY METHODS
+
+53
+
+3.3.2.1
+
+Monte Carlo Dropout
+
+The seminal work of [124] on Monte Carlo Dropout (MC Dropout, MCD) proposes efficient model uncertainty estimation by exploiting dropout regularization as an approximate Variational Inference (VI) method. In practice, the MCD procedure boils down to (i) applying dropout on all non-linear layers’ weights, and (ii) activating dropout both during training and evaluation. Quantifying “epistemic” model uncertainty using MCD involves sampling T stochastic weight sets from the variational Bernoulli distribution \hat{\theta}_t \sim q(\theta) to calculate the lower-order moments of the approximate Gaussian posterior, respectively the predictive mean and variance (Equation (3.2)).
+
+\hat{\mu}_{pred}(x^*) = \frac{1}{T} \sum_{t=1}^{T} P(y^* | x^*, \hat{\theta}_t),
+\hat{\sigma}^2_{pred}(x^*) = \frac{1}{T} \sum_{t=1}^{T} \left[ P(y^* | x^*, \hat{\theta}_t) - \hat{\mu}_{pred} \right]^2    (3.2)
+
+MCD’s simplicity and computational tractability, i.e., dropout training is a standard DL practice and prediction only requires a single model to sub-sample from, have made it one of the most popular predictive uncertainty methods. However, an important shortcoming of VI, and in consequence of MCD in [124]’s formulation, is that it is known to underestimate predictive variance [459]. We will touch on a selection of method extensions in Sections 3.3.2.3 and 3.3.2.4.
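+A minimal sketch of the MCD procedure of Equation (3.2) in TensorFlow 2 (the framework of our released benchmark software), assuming `model` is a tf.keras classifier containing dropout layers and whose final layer outputs class probabilities; the helper name is ours:
+
+    import numpy as np
+    import tensorflow as tf  # assumed: a trained tf.keras model with Dropout layers
+
+    def mc_dropout_moments(model, x, T=10):
+        # training=True keeps dropout active at inference time, so each
+        # forward pass samples a different stochastic weight set
+        probs = np.stack([model(x, training=True).numpy() for _ in range(T)])
+        mu_pred = probs.mean(axis=0)    # predictive mean, shape (batch, K)
+        var_pred = probs.var(axis=0)    # predictive variance, Equation (3.2)
+        return mu_pred, var_pred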
+3.3.2.2
+
+Deep Ensemble
+
+Deep Ensemble [238] (DE) involves independently training multiple NNs with different random weight initializations and aggregating predictions from individual models. An ensemble of NNs trades off computational resources, due to the need to train and store M models, for uncertainty estimation and robustness to dataset shift [163, 348, 489]. In comparison to MC Dropout, DEs are treated as a uniformly-weighted Gaussian Mixture model, for which the formula for predictive variance is adapted:
+
+\hat{\sigma}^2_{pred}(x^*) = \frac{1}{M} \sum_{m} \left( \sigma^2_{\theta_m}(x^*) + \mu^2_{\theta_m}(x^*) \right) - \mu_*^2(x^*), \qquad \mu_*(x^*) = \frac{1}{M} \sum_{m} \mu_{\theta_m}(x^*)    (3.3)
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0086.txt b/assets/txts/pg_0086.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ed9b964b92e085013e8dea868e5a35ad317d908c
--- /dev/null
+++ b/assets/txts/pg_0086.txt
@@ -0,0 +1,50 @@
+54
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+The empirical performance increase of ensembles can be attributed to the diversity of uncorrelated errors between ensemble members [225]. Without functional diversity in sets of model parameters, posterior approximation quality will be lower (zero variance) and for this reason, ensemble diversity promotion is a promising avenue for further improvements [49, 196]. Alternatively, the interplay between ensembling and regularization, “the effect of a prior”, warrants more thought, since not regularizing risks overfitting, while too strong regularization risks constraining diversity (see Section 3.3.4).
+
+3.3.2.3
+
+Concrete Dropout
+
+[125] proposes a continuous relaxation of the discrete dropout distribution to adapt and optimize the dropout probability p as a variational parameter using standard gradient descent. This overcomes the limitations of uncertainty underestimation, miscalibration, and the computational complexity of manually tuning the layer-wise dropout probability in deeper models [345]. By taking advantage of the reparametrization trick, the Concrete distribution approximation \tilde{z} of the original Bernoulli random variable z conveniently parametrizes to a simple sigmoid distribution (\phi = sigmoid), allowing for gradient-based optimization. Given a uniform random noise variable u and a temperature r, the expression varies with respect to the dropout probability p, which for p → 0.5 produces, at a rate of 1/r, values approaching 1.
+
+\tilde{z} = \phi\left( \frac{1}{r} \left( \log p - \log(1 - p) + \log u - \log(1 - u) \right) \right)    (3.4)
+
+Since the dropout probability characterizes the overall posterior uncertainty, Concrete Dropout can positively influence in-domain calibration at an almost negligible cost.
+
+3.3.2.4
+
+Heteroscedastic Extensions
+
+[213, 236, 500] proposed similar approaches to extend MC Dropout to allow measuring uncertainty information from different sources. Estimating input-dependent, “heteroscedastic aleatoric” data uncertainty (detailed in Section 3.3.3) requires slightly modifying the model’s architecture and objective function following [213].
+Firstly, the output layer of model f_{\hat{\theta}} is extended with a set of learnable variance variables \sigma^2 per unique class output. The model’s output logits, v, are sampled from the stochastic output layer parametrized by N(f_{\hat{\theta}}(x), diag(\sigma^2(x))).
This
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0087.txt b/assets/txts/pg_0087.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cf3e7db05acea5bf4a62c497e83fda7b66f305a3
--- /dev/null
+++ b/assets/txts/pg_0087.txt
@@ -0,0 +1,61 @@
+UNCERTAINTY METHODS
+
+55
+
+model adaptation will be referred to as the heteroscedastic model. Fig. 3.1 visualizes the difference in output layer design.
+
+Figure 3.1. Visualization of output layer blocks. The left block denotes standard softmax (multi-class) or sigmoid (binary/multi-label) output. On the right, the heteroscedastic model outputs a normal distribution N(\mu(x), diag(\sigma^2(x))), parametrizing mean and variance by the logits coming from two separate preceding feedforward layers.
+
+Next, it requires incorporating a heteroscedastic loss:
+
+L_{HET}(\hat{\theta}) = \sum_{i=1}^{N} \left( -\log \sum_{t=1}^{T} \exp\left( v_{i,c}^{(t)} - \log \sum_{k}^{K} \exp v_{i,k}^{(t)} \right) + \log T \right)    (3.5)
+
+with N the number of training examples passing through an instance t of the model f_{\hat{\theta}_t}(x) + \sigma^{(t)} (the exponent 2 omitted in the sampling superscript) to generate for example i a sampled logit vector v_i^{(t)} \in R^K, where the predicted value for class k is v_{i,k}^{(t)} \in R, and c is the index of the ground-truth class. The above loss formulation shares notation with a categorical cross-entropy objective, although the loss is computed over T sampled logits v_i^{(t)} perturbed with parameterized Gaussian noise. By learning to predict log variance over T dropout-masked samples, the model will be able to output high variance (uncertainty) for inputs where the predictive mean is far removed from the true observation, which by design has a smaller effect on the total loss.
+
+3.3.3
+
+Uncertainty Estimation
+
+In this Subsection, we will introduce sources of uncertainty, a categorization of uncertainty measures, and how uncertainty is quantified in practice.
+Total Uncertainty  Classification models trained by minimizing negative log-likelihood quantify global uncertainty over class outcomes with entropy (H) over
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0088.txt b/assets/txts/pg_0088.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b3d00f6bebf93190db8d9b83421c62320812cd1a
--- /dev/null
+++ b/assets/txts/pg_0088.txt
@@ -0,0 +1,48 @@
+56
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+logits. Therefore, the entropy of the posterior predictive distribution provides a measure of the total uncertainty, which is a combination of model and data uncertainty [190]. Instead of entropy, posterior predictive variance can also be decomposed into model and data uncertainty using the law of total variance [92]. Decomposing total uncertainty into the different sources is beneficial for determining actions to evaluate the room for improvement.
+Model Uncertainty  Epistemic uncertainty presents the inherent ignorance [345] of the model with regards to the true values for its parameters and structure after having seen the training data. Next to predictive variance, Mutual Information (MI) [426] has been proposed as a measure of epistemic uncertainty, as intuitively it captures the amount of information that would be gained about the model parameters through “knowledge” of the true outcome [305].
+Data Uncertainty  Aleatoric uncertainty captures the inherent stochasticity and noise in data.
It can be further decomposed into a homoscedastic component, which represents constant noise over inputs such as the numerical accuracy of a measuring device, and heteroscedastic uncertainty representing input-dependent noise generated by class overlap, complex decision boundaries or label noise [92]. Heteroscedastic data uncertainty allows for the expression of instance-level uncertainty together with the best possible prediction.
+Uncertainty categorization  Here follows a categorization of the uncertainty measures from the methods (and combinations) of Section 3.3.2. We directly provide estimators for the theoretical quantities that are defined as arising from either entropy- or variance-based uncertainty decomposition in [92]. To estimate for a new test sample x^* the prediction and uncertainty of model f_{\hat{\theta}}(x^*), we typically seek to obtain the predictive posterior distribution P(y^*|x^*, \hat{\theta}) over class membership probabilities with y_k^* \in {1, ..., K}. For MC Dropout at inference time, we presume
+
+P(y^* | x^*, \hat{\theta}) \approx \frac{1}{T} \sum_{t=1}^{T} P(y^* | x^*, \hat{\theta}_t),
+
+with the prediction obtained after applying the softmax/sigmoid function for sample t, \hat{p}_t = P(y^*|x^*, \hat{\theta}_t). For Deep Ensemble, the above notation would require a change from T to M, but for consistency over quantity formulas, we maintain T to denote posterior sampling. For ease of notation, we define a helper entropy function H(x^*, \cdot) = -\sum_{k=1}^{K} P(y_k|x^*, \cdot) \log P(y_k|x^*, \cdot), with \cdot an input argument
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0089.txt b/assets/txts/pg_0089.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3abfb72e451dca948f69bf3df50fa1b2e98a3d68
--- /dev/null
+++ b/assets/txts/pg_0089.txt
@@ -0,0 +1,78 @@
+UNCERTAINTY METHODS
+
+57
+
+to the function.
+
+Quantity            Formula
+Softmax-score       S = \max_k \exp f_{\hat{\theta},k}(x^*) / \sum_{j=1}^{K} \exp f_{\hat{\theta},j}(x^*)
+Predictive Entropy  H_{pred} = H(x^*, \hat{\theta})
+Mutual Information  I = H_{pred} - \frac{1}{T} \sum_{t=1}^{T} H(x^*, \hat{\theta}_t)
+Model Uncertainty   \hat{\sigma}^2_{model} = \frac{1}{T} \sum_{t=1}^{T} (\hat{p}_t - \hat{\mu}_{pred})^2
+Data Uncertainty    \hat{\sigma}^2_{data} = \frac{1}{T} \sum_{t=1}^{T} \frac{1}{K} \sum_{k=1}^{K} var_k^{(t)}(x^*)
+
+For any classification model, it is possible to compute the softmax-score and predictive entropy. For multi-label classification, the softmax-score does not take into account multiple winning classes and a standard approximation^2 would be to average over the sigmoid-scaled probabilities of predicted classes.
+Model uncertainty can be quantified with Monte Carlo integration or the aggregation of individual models [461]. In practice, it is quantified by either (a) calculating the average sigmoid/softmax variance over the predictive mean from MC samples (Equation (3.2)) or (b) computing the total variance from an ensemble mixture distribution (Equation (3.3)). Changing to the heteroscedastic extensions allows quantifying data uncertainty. More specifically, data uncertainty is quantified with, as a “surrogate” [500], the average over the variance logits var = \sigma^2 (see Fig. 3.1). Whenever ensembling is applied where a single model estimates a quantity, one typically averages over the ensemble components’ uncertainty.
+2 Intending to compare directly with multi-class results, averaging uncertainty estimates to obtain a single summary statistic for multi-label predictions is more straightforward than
In particular, the tested multi-label datasets share low average +label cardinality, a high degree of label correlation, and a large set of unique classes (K > 50). + + \ No newline at end of file diff --git a/assets/txts/pg_0090.txt b/assets/txts/pg_0090.txt new file mode 100644 index 0000000000000000000000000000000000000000..f58db5cfe5f5fba0bf71a3d542c96989c4bc8498 --- /dev/null +++ b/assets/txts/pg_0090.txt @@ -0,0 +1,47 @@ +58 + +3.3.4 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +Motivating Hybrid Approaches + +This Subsection will motivate the theorized complementarity of VI-based and +ensembling methods for improved uncertainty estimation and robustness. +In light of the empirical success of Deep Ensemble, recent research [118, 496] +raises an important question concerning the difference in function-space between +variational Bayesian NNs (MC Dropout and extensions) and Deep Ensemble. +Deep NNs are parametrized (typically non-linear) functions presenting a highdimensional non-convex optimization problem, which may concern widely +varying curvature and many flat regions with multiple locally optimal points +within each [255]. Applying an optimization procedure to a maximum-aposteriori (MAP) objective involves a search for parameter values (hypotheses) +for which the loss function is low by navigating the high-dimensional loss +landscape. Once model training converges, one ends up with a weight-space +solution, representing a single mode of the parameter posterior . One such mode +is a local optimum of the loss function L(θ), representing unique functions fθ +as a set of NN parameters [133]. Each mode potentially marks a meaningfully +different representation of the data. +The true posterior is generally a highly complex and multimodal distribution, +with multiple possible but not necessarily equivalent parametrizations θ able to +fit the training data. To accurately quantify posterior uncertainty, we wish to +capture as many modes or separated regions as possible [117, 496]. +Correspondingly, the common goal is to achieve reliable uncertainty and, +following the BDL paradigm, one resorts to modeling a Bayesian posterior. +What differs among the selected predictive uncertainty methods, is the form of +the prior P (θ) over model parameters and likelihood P (D|θ) [336], from which +to determine a procedure. Below we expound on the difference in posterior +approximation procedure: +• MC Dropout is a common VI procedure with Bernoulli dropout and Gaussian +(L2) priors on weight-space, assuming a posterior Gaussian distribution +from which to draw stochastic samples. VI-based methods tend to locally +approximate uncertainty surrounding a single mode, intra-modal posterior +approximation. Specifically, MC Dropout’s procedure can be interpreted as +imposing a spike-and-slab parameter prior with peaked variance [333], which +offers a plausible explanation for approximated uncertainty centered tightly +around 1 mode. +• An ensemble of NNs makes no direct assumptions on the form or distribution +of the prior and just “obtains” different samples from the parameter posterior. 
+It generates a series of MAP estimates which, through inherent stochasticity in weight initialization and optimization, end up at different regions in weight space, leading to functionally dissimilar but more or less equally accurate
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0091.txt b/assets/txts/pg_0091.txt
new file mode 100644
index 0000000000000000000000000000000000000000..38a31fcda902f34b4eb4926ef14ca0477965689b
--- /dev/null
+++ b/assets/txts/pg_0091.txt
@@ -0,0 +1,46 @@
+UNCERTAINTY METHODS
+
+59
+
+modes of the solution space. Due to randomness in the optimization, some solutions may be significantly worse than others as measured by different metrics (e.g., accuracy vs. calibration). Ensembles are effective at exploring the weight-space and by solving the MAP estimation problems converge to multiple modes [117, 149], allowing for inter-modal posterior approximation. Furthermore, by considering more possible hypotheses they will be better at approximating multimodal posterior distributions and avoid the collapse to a single mode [496].
+Combining both procedures amounts to generating a mixture over priors [119], which in itself is again a prior, all under the same likelihood function. There is no guarantee that a combination of methods from both procedures captures the true posterior, yet in our work we will empirically analyse if combining inter- and intra-modal posterior approximation offers the hypothesized complementary benefits.
+
+3.3.5
+
+Uncertainty Calibration under Distribution Shift
+
+In this Subsection, we motivate the meaningfulness of evaluating uncertainty methods under distribution shift and what restricted assumptions one should reasonably specify to guarantee useful empirical results.
+We consider the problem of detecting out-of-distribution data from a trained classifier’s uncertainty. Let P^S(x, y) and P^T(x, y) denote two distinct distributions, respectively in-domain and out-of-domain. Further, we assume a classifier f : X → [0, 1] trained on P^S, whereas in the experimental setup we test on a mixture distribution P^{(S,T)}(x, y). Given an input x from the mixture, we test if the classifier’s uncertainty can be exploited to distinguish from which distribution the sample comes. To be clear, in this setting we expect to detect uncertainty arising from distribution shift and not from a lack of training data. It can be argued that there is a relationship between both, as having few in-domain samples complicates generalization, in turn increasing the chance of flagging a new data point as OOD.
+Uncertainty estimation is generally well-defined in the context of in-domain data with the standard assumption that samples are independent and identically distributed (i.i.d.). In this setting, evaluation is typically expressed in terms of calibration (Definition 8), particularly as statistical error with respect to the conditional expectation (Definition 7).
+To obtain a reliable probabilistic classifier in the traditional i.i.d. setting, explicit in-domain re-calibration approaches are effective [156, 229, 490].
However, there
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0092.txt b/assets/txts/pg_0092.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1695414ddc1821d956ae75a49885a709b9b4b9d1
--- /dev/null
+++ b/assets/txts/pg_0092.txt
@@ -0,0 +1,47 @@
+60
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+is no general principle which states that a classifier, however calibrated on P^S, would be calibrated on OOD data from P^T. Infinitely many possible shifts can violate the standard i.i.d. assumption at varying degrees of severity, affecting calibration and uncertainty estimation in unpredictable ways. With the aim of still being able to rely on a classifier’s uncertainty calibration to predict future generalization, there is a need to relax the i.i.d. assumption. An important condition for meaningful uncertainty estimation is to impose realistic, yet sufficiently restrictive assumptions on the nature of distribution changes and how P^S and P^T relate. The covariate shift [34, 418] assumption may be the most widely studied when the real-world data distribution differs from the training distribution.
+Recently, [354] formalized the problem of calibrated prediction under covariate shift with theoretical bounds on calibration transfer over domains. Critically, related works [104, 145, 335, 349, 483] prove with importance weighting that shared structure and high overlap in distribution support (or conversely, low domain divergence) is crucial to upper bound the increase of calibration error due to covariate shift. To put it plainly: while one cannot guarantee calibration on OOD data in the general case, if domains are reasonably close one can expect to retain (some if not most) benefits from in-domain calibration.
+Specific to our work, we consider two experimental settings (Section 3.4.5) with different distribution shift [320] between domains. Here we characterize each with the related distribution shift assumptions. (i) Cross-domain classification, where covariates differ, P^T(X) ≠ P^S(X), but label distributions are identical, P^T(Y|X) = P^S(Y|X) [418]. (ii) Novelty detection, where label distributions disagree, P^T(Y|X) ≠ P^S(Y|X), since the label sets differ between domains, [Y]^T ≠ [Y]^S [307]. Whereas (i) is a clear case of covariate shift, we reasonably assume for (ii) that covariates are generally close, P^T(X) ≈ P^S(X), and that the overall conditional shift will be small. Rather than interpreting novelty as a shift in label sets, one might define the probability of seeing some labels under S as exactly zero, while under T their probability is ε > 0. In practical text classification settings, novel class inputs will typically start occurring with small frequency in the real-world data distribution, as well as not having completely different syntax and semantics. This implies that ‘excess’ calibration error (defined as an expectation over the mixture) will only be impacted slightly.
+Clearly specifying distribution shift assumptions is quintessential for reliably benchmarking uncertainty methods, since the calibration of each tested method can be affected in different ways and produce results biased towards an evaluation configuration. In our selected experimental settings, we can justify uncertainty calibration under distribution shift as a reasonable methodology, without making further claims on the general applicability of this evaluation procedure.
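+The mixture test described in this Subsection reduces to a binary ranking problem: can a per-sample uncertainty score separate samples drawn from P^S and P^T? A minimal sketch under stated assumptions (uncertainty scores, e.g., predictive entropy, already computed on held-out in-domain and shifted data; function and variable names are ours, not from the released benchmark code):
+
+    import numpy as np
+    from sklearn.metrics import average_precision_score, roc_auc_score
+
+    def shift_detection_scores(unc_id, unc_ood):
+        # label 1 marks the shifted/OOD samples; a good uncertainty
+        # estimate should rank them above the in-domain samples
+        labels = np.concatenate([np.zeros(len(unc_id)), np.ones(len(unc_ood))])
+        scores = np.concatenate([unc_id, unc_ood])
+        return roc_auc_score(labels, scores), average_precision_score(labels, scores)
+
+AUROC and AUPR summarize exactly this ranking and are the shift-detection metrics adopted in Section 3.4.4.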
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0093.txt b/assets/txts/pg_0093.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80d14fcd3f8c23183a96226bd977b6f04df336c1
--- /dev/null
+++ b/assets/txts/pg_0093.txt
@@ -0,0 +1,49 @@
+EXPERIMENTAL METHODOLOGY
+
+61
+
+3.4
+
+Experimental Methodology
+
+In this work, our objective is to reliably benchmark both existing and novel combinations of predictive uncertainty methods in order to draw conclusions for text classification applications. This Section describes our study’s experimental methodology, with which we generate the empirical evidence presented in Section 3.5. Section 3.4.1 introduces our hypotheses on complementary benefits for uncertainty estimation and details the hybrid methods. Given the focus on text classification tasks, Section 3.4.2 motivates a set of representative datasets, with a specification of different text problem characteristics. Section 3.4.3 documents two pre-selected text classification architectures, the first a simple and more controllable configuration for uncertainty benchmarking, the second a more complex NLP architecture for which we will compare relative gains in robustness. To ensure correct performance benchmarking, Section 3.4.4 summarizes the metrics used for evaluating calibration and robustness. Finally, Section 3.4.5 expounds on the model setups and experimental settings devised to compare predictive uncertainty methods.
+
+3.4.1
+
+Proposed Hybrid Approaches
+
+This Subsection stands central in our work: we motivate combinations of predictive uncertainty methods. We build hypotheses on complementary benefits from combining multiple uncertainty methods, for which we present an overview of the hybrid methods in scope of our experiments (Table 3.1).
+Given the obvious parallels and differences between both procedures presented in Section 3.3.4, we hypothesize complementary benefits for uncertainty estimation and robustness.
+A. Whereas ensembles are adept at capturing multiple modes, they do not approximate uncertainty surrounding a single mode in solution space. However, since there is a lot of redundancy in function space, local neighborhood uncertainty approximation might make only a minimal contribution to the overall posterior uncertainty. [118] validated that applying subspace sampling on an optimized solution improves in-domain accuracy and calibration. They note improvements relatively lower than from increasing ensemble size (M), yet they did not analyze joint effectiveness.
+B. A procedure can only be as good as the prior and the likelihood function, which in approximation of the intractable parameter posterior is limited by computational constraints (number of MC samples T, number of ensemble
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0094.txt b/assets/txts/pg_0094.txt
new file mode 100644
index 0000000000000000000000000000000000000000..274b4e937d5fc1fc891875b947192bcc015dd10f
--- /dev/null
+++ b/assets/txts/pg_0094.txt
@@ -0,0 +1,60 @@
+62
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
In turn, by modeling an ensemble of VI approximate posteriors +less ensemble members could be required to reach the same in/out-ofdomain performance as measured by the size and quality of captured +solutions. [118] already observed that ensembles saturate after reaching +peak in-domain performance, with suboptimal models taking over the +benefit. +C. Important to note is that the influence of the prior and variational +parameters requires fine-tuning, since over-regularization will reduce the +optimization problem to one with an over-smooth, possibly unimodal +landscape [117, 133]. This eliminates any functional diversity for whatever +ensemble size, where the solution will be overconfident. Alternatively, +since the hypothesis space for a NN is often so large, with many possible +likely models for finite data, that some posterior collapse will often be +desirable to reduce the number of considered hypotheses. [496]. +Table 3.1 summarizes all model setups and hybrid methods considered for +our experiments. The most complete combination is MC Concrete Dropout +Heteroscedastic Deep Ensemble, where each member m of the ensemble has +optimized the layer-wise dropout rate p and heteroscedastic loss LHET , with the +final predictive distribution over K classes deriving from M times T stochastic +MC Dropout samples (M x T x K). +Table 3.1. In total, we consider 18 model setups, based on combining methods and +options from each column. (*) Deterministic dropout can only combine with Deep +Ensembles. CE stands for cross-entropy loss. + +Dropout + +MC sampling + +Heteroscedastic + +Deep Ensemble + +p = 0* +p = 0.5 +Concrete + +T =1 +T = 10 + +LCE +LHET + +M =1 +M =5 + +We admit two baselines, Unregularized and Regularized. +Unregularized (p = 0) offers a clean comparison, discounting any influence of +sparsification (dropout) or normalization of weight magnitude (weight decay). +However, it possibly overfits parameters to training data. In practice, one +would always apply some combination of regularization (dropout, weight decay, +batch normalization, data augmentation, ...) to counter overfitting. Regularized + + \ No newline at end of file diff --git a/assets/txts/pg_0095.txt b/assets/txts/pg_0095.txt new file mode 100644 index 0000000000000000000000000000000000000000..c64ea8b066891f6e091fb92236ec6e50767f3e1f --- /dev/null +++ b/assets/txts/pg_0095.txt @@ -0,0 +1,104 @@ +EXPERIMENTAL METHODOLOGY + +63 + +(p = 0.5) gives an alternate point of comparison over uncertainty methods, such +that we can exclude that performance increase for an uncertainty method does +not only come from regularization, which some such as MC Dropout rely upon. +Adhering to good practices and since we build ensembles with default M = 5, +we report the mean (and standard deviation) for all individual models, making +the results more statistically reliable than comparing to 1 independently trained +model. + +3.4.2 + +Datasets + +We use six well-studied real-world text corpora characterized by a different +number of classes, classification task, and size of the documents (Table 3.2). +Table 3.2. D denotes the number of documents in the dataset, K the number of classes, +I the class imbalance ratio [444], W the average number of words per document, V +the total vocabulary size respectively. 
+corpus + +task + +D + +K + +I + +W + +V + +20news +IMDB +CLINC-OOS +Reuters ApteMod +AAPD +Amazon Reviews (#4) + +newswire topic +movie review +intent detection +newswire topic +academic paper subject +product sentiment + +18,848 +348,415 +22,500 +10,786 +55,840 +8,000 + +20 +10 +150 +90 +54 +2 + +5e-4 +0.03 +0 +0.14 +0.04 +0 + +240 +325.6 +8 +125.2 +145.4 +189.3 + +212,267 +115,073 +6,188 +65,035 +66,854 +21,514 + +The first three datasets share the task of multi-class classification in three +different text domains. +20News [239] is a collection of 20K newsgroup documents with balanced samples +for 20 different newsgroups. To allow for direct comparison, we use the dataset +in the benchmark format of [172]. +IMDB movie reviews [97] (imdb) is a large sentiment classification dataset +which links user-based reviews of movies with labels on an ordinal scale between +1 and 10. Since there are no standard splits for this dataset we generate +randomized (seed 42) stratified splits of 65% for training, 15% validation and +20% for testing. +CLINC-OOS (CLINC150) [240] is a recently become popular intent detection +dataset comprising 150 training sentences for each of the 150 system-supported +services. Next to this, it offers a separate Out-of-Scope (OOS) subset with +1200 natural sentences which can be used for Out-of-Domain (OOD) detection, +more specifically detecting novel class instances. This dataset differs from the +previous two through very short “intent” sentences requiring classification in a +large output space. For training and evaluation, we use the predefined splits of +TensorFlow Datasets. + + \ No newline at end of file diff --git a/assets/txts/pg_0096.txt b/assets/txts/pg_0096.txt new file mode 100644 index 0000000000000000000000000000000000000000..84eb5bcf5a65b0950a231ae721697379a61ce05a --- /dev/null +++ b/assets/txts/pg_0096.txt @@ -0,0 +1,47 @@ +64 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +We include two popular multi-label text classification datasets, since they are +often not considered for uncertainty experiments. We argue that they should +be included since their multi-label nature is very common in text classification +where not all labels have to be mutually-exclusive, e.g., topic categorization, +subject attribution, ... +Reuters ApteMod [17] is a multi-label news topic categorization dataset with +90 possible topics and an average low label cardinality (C) of 1.24. We use the +standard ApteMod splits. +Arxiv Academic Paper Dataset (AAPD) [505] comprises 55,840 computer +science paper abstracts that have been labeled with corresponding multiple +subject matters. Each academic paper has on average 2.41 subject targets with +a minimum of 2. For reproducibility purposes, we use the same preprocessing +steps and splits as in [5, 505] with 1K dev and 1K test samples. +Amazon Reviews [45] is a widely-used benchmark for domain adaptation research +in NLP. It consists of binary sentiment classification datasets from four different +domains: Books, DVDs, Electronics and Kitchen appliances. Each domain +dataset contains 1K positive and 1K negative labeled instances. Following +the convention of previous works [103, 557], we construct 12 balanced crossdomain sentiment analysis tasks, where for each source dataset we randomly +hold out 400 test instances to evaluate in-domain and always predict on the full +target dataset. We reserve this dataset for cross-domain experimentation only +(Section 3.4.5.2). 
+
+3.4.3
+
+Architecture
+
+This Subsection motivates the two NLP architectures in scope for the experiments.
+TextCNN architecture  We use a 1-D Convolutional NN for text classification (TextCNN), following the model structure of [218]. We chose this architecture for its comparative simplicity and solid out-of-the-box performance on a range of text classification tasks. Even as a light-weight model, it can deal with feeding in text sequences of varying sizes and learning n-gram-like structures over word embeddings, allowing a fair comparison across text datasets. An extensive hyperparameter study determined that regularization does not impact its performance much [537].
+Transformer architecture  Models in NLP have become increasingly deep and complex with the advent of the Transformer architecture [473]. [94] combined multiple bidirectional Transformers with wordpiece tokenization and self-supervised pretraining objectives (masked language modeling and next sentence prediction) to create the contextual representation modeling
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0097.txt b/assets/txts/pg_0097.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8294eafcdbb99035e82493de49f69922f1fc8ae5
--- /dev/null
+++ b/assets/txts/pg_0097.txt
@@ -0,0 +1,27 @@
+EXPERIMENTAL METHODOLOGY
+
+65
+
+architecture BERT. It allows for fine-tuning on downstream tasks, where BERT has outperformed task-specific architectures even in low-resource settings. In our experiments we use BERT-base (uncased, English): 12 layers, 768 hidden dimensions, 12 attention heads, with a total number of 110M parameters.
+
+(a) TextCNN
+
+(b) BERT
+
+Figure 3.2. Simplified block-diagrams for each of the NN architectures, demonstrating on which layer weights dropout is applied.
+(a) The TextCNN model architecture with 3 kernels (K1–K3), E word embedding dimensionality and F number of feature maps per kernel.
+(b) The BERT model architecture with L Transformer blocks, hidden size H and number of self-attention heads A.
+
+Complexity  TextCNN comprises only 6M parameters, with most parameters residing in the embedding matrix. However, it is restricted to a fixed window size, with the downside of not being able to determine long-distance dependencies in text. BERT, on the other hand, has already captured prior language modeling knowledge thanks to pretraining. Nevertheless, our experiments already involve
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0098.txt b/assets/txts/pg_0098.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8597058182b794103dbca8d781d485ad088799a2
--- /dev/null
+++ b/assets/txts/pg_0098.txt
@@ -0,0 +1,47 @@
+66
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+significant computational complexity, which is why we decided not to run all variations with BERT. TextCNN presents a more controllable configuration, achieving decent performance and sufficing for the evaluation of predictive uncertainty in text classification. We include an ablation study (Section 3.5.4.2) comparing specifically selected models trained with BERT as base architecture.
+
+3.4.4
+
+Evaluation metrics
+
+Since no single metric measures all desirable properties of predictive uncertainty, we use a variety of conventional metrics to evaluate our models’ performance: (a) calibration metrics, (b) proper scoring rules, and (c) classification scores.
+The metrics are defined in detail in Section 2.2.3; here we will only provide a brief description.
+For in-domain evaluation, we use the following metrics: (a) Expected Calibration Error (ECE) [156, 332], (b) Brier Score [50] and (b) Negative Log-Likelihood (NLL) [378]. We use the same metrics for out-of-domain evaluation, with the addition of (c) AUROC and (c) AUPR for distribution shift detection, following [172].
+When evaluating a model trained in a source domain on a target domain with a similar task, we denote accuracy in the target domain as OOD accuracy, as opposed to accuracy in the source domain, which we denote as ID accuracy.
+
+3.4.5
+
+Experimental design
+
+We have determined three logical settings in text classification to evaluate predictive uncertainty for each model setup. We present experiments on in-domain uncertainty to form baseline results, followed by cross-domain classification with a focus on out-of-domain detection, and finally we propose novelty detection as a new protocol to evaluate predictive uncertainty.
+While there is no gold-standard procedure for comparing multiple (uncertainty) methods over multiple (text classification) datasets, we opted for an established procedure with statistical testing via multiple comparisons [89, 109]. Since we consider an exhaustive list of model setups, we present our results in terms of rank and critical difference diagrams in order to analyze the relative performance of each method over different experimental settings.
+Concretely, each dataset concerns independent measurements, for which we rank each method, then compare average ranks, and in the event that we can
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0099.txt b/assets/txts/pg_0099.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bb45385d2a02724c5db4a2f4cdb1c87167f945f9
--- /dev/null
+++ b/assets/txts/pg_0099.txt
@@ -0,0 +1,47 @@
+EXPERIMENTAL METHODOLOGY
+
+67
+
+reject the null-hypothesis (H0: all methods have the same rank), we calculate post-hoc tests with critical differences over methods. However, only reporting ranks does not allow future researchers to compare to our work, which is why we include detailed absolute-number results in Appendix C.
+
+3.4.5.1
+
+In-domain Setting
+
+To evaluate in-domain (ID) uncertainty, we will focus on measuring calibration and prediction quality with proper scoring rules (see Section 3.4.4). The ID setting assumes that the train and test examples are i.i.d. To capture all details, we compare per task setting, multi-class and multi-label, and finally zoom in on dataset-specific observations. For the in-domain evaluation, we focus on unique contributing effects per predictive uncertainty method and the relation between method combinations and evaluation metrics.
+• When evaluating with proper scoring rules, does an absolute increase in combination size (higher T or M) correlate with better performance?
+• What effect (equal over all tasks, datasets or architectures) can be discerned per unique predictive uncertainty method?
+
+3.4.5.2
+
+Cross-domain Setting
+
+Since we test over sentiment classification datasets from multiple domains (Amazon Product Reviews), we seek to analyze uncertainty reliability across domains. However, learned knowledge from a source domain can often transfer to classification in the target domain.
Given this setting, we need to account for cross-domain generalization in addition to out-of-domain detection; the latter is the focus of our experiments.

Cross-domain generalization - how well does a classifier trained in a source domain perform on a dissimilar target domain sharing a similar task? The aim of cross-domain generalization is to learn a robust classifier, which can perform well in multiple domains even if there is limited labeled data in some of the domains. Domain discrepancy is a major challenge where, for instance, linguistic sentiment expressions used in one domain can differ from those of the source domain. For example, “garbage disposal” is neutral in kitchen appliances whereas a “garbage movie” is strictly negative. This domain discrepancy challenge is often approached by adaptation [497, 557] or by encouraging domain-agnostic feature representations [103, 129]. We propose to test out-of-domain detection with predictive uncertainty as a viable fallback strategy when achieving generalization over domains is difficult.

\ No newline at end of file diff --git a/assets/txts/pg_0100.txt b/assets/txts/pg_0100.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ca3f76f0e0f92cf870d3e4f20cfc9c19cbd1a7f --- /dev/null +++ b/assets/txts/pg_0100.txt @@ -0,0 +1,44 @@

Out-of-domain detection - how reliably can a classifier trained in a source domain communicate uncertainty in a target domain, provided good/bad generalization? Whenever a model does not generalize to OOD examples, we would expect the model to be uncertain, allowing detection in order to abstain or trigger conservative fallback strategies [108]. As a proxy for good/bad generalization we measure the gap between in-domain and target domain accuracy as evidence of train-test skew. We argue that our current setting is more realistic than benchmarking OOD detection in totally disparate domains, such as evaluating a newswire classifier on movie reviews.

Our analysis will be centered on the following question:

• How does domain similarity affect out-of-domain detection with uncertainty methods? Is there a clear increase of uncertainty given a higher OOD generalization gap?

3.4.5.3 Novelty Detection Setting

Novelty detection - how well can the model identify and communicate uncertainty on samples of a novel class? In the worst case, classifiers “fail silently” and wrongly attribute high confidence to an in-distribution class [11, 146]. In the best case, the model either lowers its confidence or signals uncertainty. Prior work hypothesizes model uncertainty to be the most impacted [213, 250].

With this experiment we simulate the conditions of novel class data by removing one or multiple classes during training. The resulting distribution shift is not too far from the original domain and cannot be considered fully out-of-distribution (as detailed in Section 3.3.5).

We determine diverse novelty detection strategies adapted per dataset. For 20news, we follow [172, 348] and take out all odd-numbered classes to simulate novel distribution shift. Since imdb is a sentiment classification dataset, we isolate the middle class, rating “5” out of the 10 ratings, from training and expect the models to allocate prediction mass to a label close to the holdout class (ratings “4” or “6”). CLINC-OOS provides a separate out-of-scope intents set on which we assess novel class robustness.
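As an illustration of the class-holdout protocol, a minimal sketch of a 20news-style split where odd-numbered classes are withheld from training and later treated as novel at test time; the data structures and names are hypothetical placeholders:

    import numpy as np

    def novelty_split(texts, labels):
        """Simulate novel-class shift: train only on even-numbered classes,
        treat the held-out odd-numbered classes as 'novel' at test time."""
        labels = np.asarray(labels)
        seen = labels % 2 == 0                    # even classes stay in training
        train_texts = [t for t, s in zip(texts, seen) if s]
        novel_texts = [t for t, s in zip(texts, seen) if not s]
        return train_texts, labels[seen], novel_texts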
We devise a new strategy for the multi-label classification datasets, where we isolate a class that is very distinct from the remaining classes, i.e., one that (i) does not often appear jointly with the remaining classes in the originally multi-label annotated dataset, and (ii) occurs frequently enough to guarantee representative results. We draw statistics on the label co-occurrence rates of each dataset, and find that for Reuters “Acquisitions” (id:0) occurs in 94% of

\ No newline at end of file diff --git a/assets/txts/pg_0101.txt b/assets/txts/pg_0101.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17f5a26d0d1fa84917ca485f5da75f598dcd3a9 --- /dev/null +++ b/assets/txts/pg_0101.txt @@ -0,0 +1,36 @@

documents as a single topic, making it an ideal candidate for testing novel class detection. For AAPD we apply a similar strategy and find the frequent label “CS.it” (id:0) to have relatively low label co-occurrence (2.49), even when there are at least 2 labels to be predicted per sample. We isolate all examples where the novel class appears, either alone or in combination with other labels.

We focus our analysis on three specific questions concerning predictive uncertainty under distribution shift, and compare generally to other modality benchmarks:

• Do hybrid predictive uncertainty methods incrementally or critically improve detection of unseen class instances?
• Does calibration in the in-domain setting translate to calibration under distribution shift?
• Do we see the same trends as in benchmarks from different modalities (Section 3.2)?

3.5 Results

We will present the experimental results in a step-wise manner to avoid confusion about the conclusions to be drawn. We start with general and task-specific trends observed for the in-domain setting, followed by the distribution shift experiments: cross-domain classification and novelty detection. Finally, we present 4 ablation studies on critical, learned or empirically set hyperparameter values.

Figure 3.3. In-domain results with critical difference diagram comparing all methods by average rank, with the calculated critical difference in the top-left and Friedman χ2 p-value top-right. Concrete Dropout Ensemble achieves the highest NLL rank. While comparing over 5 datasets, the critical difference is large, with only the two top-ranked methods significantly differing from MC Dropout.

\ No newline at end of file diff --git a/assets/txts/pg_0102.txt b/assets/txts/pg_0102.txt new file mode 100644 index 0000000000000000000000000000000000000000..c160caea739d6724ad6fb0f165cf0f40ba281170 --- /dev/null +++ b/assets/txts/pg_0102.txt @@ -0,0 +1,241 @@

3.5.1 Experiment: In-domain

Naively combining predictive uncertainty methods will not give any absolute performance increase, as proper scoring rules show no correlation (-0.01) with the absolute number of predictive uncertainty methods combined. This requires deeper analysis to identify which singular or hybrid methods do significantly outperform baselines.

First, we visualize general results with critical difference diagrams comparing all methods by average ranking over datasets (Fig. 3.3). Critical difference (CD) can be interpreted as the smallest difference between methods that is likely to indicate a significant improvement.
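For reference, a minimal sketch of the ranking procedure behind these diagrams (average ranks, Friedman test, and Nemenyi critical difference, following the multiple-comparison setup of [89]); the placeholder scores and the q_alpha constant are illustrative assumptions:

    import numpy as np
    from scipy import stats

    # scores[i, j]: metric for method j on dataset i (lower is better, e.g. NLL)
    scores = np.random.rand(5, 18)                 # placeholder values

    ranks = np.apply_along_axis(stats.rankdata, 1, scores)
    avg_ranks = ranks.mean(axis=0)                 # rank 1 = best method

    # Friedman test: H0 = all methods have the same average rank
    chi2, p_value = stats.friedmanchisquare(*scores.T)

    # Nemenyi critical difference: CD = q_alpha * sqrt(k(k+1) / (6N))
    n_datasets, k = scores.shape
    q_alpha = 3.48                                 # illustrative studentized-range value
    cd = q_alpha * np.sqrt(k * (k + 1) / (6.0 * n_datasets))
    # methods whose average ranks differ by more than `cd` differ significantly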
In short, the null hypothesis —there is no significant difference between the methods— cannot be rejected for methods connected by a dark bar. We also report the Friedman χ2, a non-parametric statistical test that ranks methods over different attempts, in our case datasets, and requires a minimum of 3 methods in the comparison. This test checks whether the measured average ranks are significantly different from the mean rank that is expected under the null-hypothesis.

Table 3.3. In-domain (left) combined Brier and NLL proper scoring rule pairwise comparison counts of wins/draws/losses and (right) ECE metric reported for comparing in-domain calibration. For in-domain predictive accuracy, ensembles are clearly superior. Considering only miscalibration, Concrete Dropout generally adds calibration to predicted probabilities. The combination with MC Dropout gives unpredictable ranking results.

Brier + NLL (proper scoring rules):

ref  method                                          wins  draws  losses
  9  Deep Ensemble                                    142      0      28
 12  Concrete Dropout Ensemble                        135      1      34
 16  Heteroscedastic Concrete Dropout Ensemble        130      4      36
 15  MC Heteroscedastic Ensemble                      114      2      54
 17  MC Heteroscedastic Concrete Dropout Ensemble     114      2      54
 11  MC Ensemble                                      111      3      56
 13  MC Concrete Dropout Ensemble                     102      0      68
 10  Deep Ensemble Regularized                         90      1      79
 14  Heteroscedastic Ensemble                          82      2      86
  0  Unregularized                                     79      4      87
  5  Concrete Dropout                                  77      1      92
  7  Heteroscedastic Concrete Dropout                  70      3      97
  8  MC Heteroscedastic Concrete Dropout               65      2     103
  6  MC Concrete Dropout                               58      0     112
  4  MC Heteroscedastic                                40      5     125
  2  MC Dropout                                        39      6     125
  1  Regularized                                       34      0     136
  3  Heteroscedastic                                   30      0     140

ECE:

ref  method                                          wins  draws  losses
  5  Concrete Dropout                                  68      1      16
 12  Concrete Dropout Ensemble                         58      1      26
  4  MC Heteroscedastic                                52      1      32
  8  MC Heteroscedastic Concrete Dropout               52      0      33
  2  MC Dropout                                        49      2      34
 15  MC Heteroscedastic Ensemble                       48      1      36
 16  Heteroscedastic Concrete Dropout Ensemble         48      0      37
  7  Heteroscedastic Concrete Dropout                  46      0      39
  9  Deep Ensemble                                     45      1      39
  0  Unregularized                                     40      2      43
  6  MC Concrete Dropout                               40      0      45
 11  MC Ensemble                                       38      2      45
 17  MC Heteroscedastic Concrete Dropout Ensemble      37      1      47
  1  Regularized                                       32      0      53
  3  Heteroscedastic                                   29      2      54
 14  Heteroscedastic Ensemble                          27      2      56
 10  Deep Ensemble Regularized                         24      2      59
 13  MC Concrete Dropout Ensemble                      23      0      62

Table 3.3 shows more detailed pairwise comparison scores, demonstrating that if both proper scoring rules are considered, plain ensembles and hybrid methods based on deep ensembles are overall superior to single-model uncertainty prediction methods. However, the benefit resides more in accuracy than calibration, where some single-model predictive uncertainty methods rank higher, specifically Concrete Dropout.

\ No newline at end of file diff --git a/assets/txts/pg_0103.txt b/assets/txts/pg_0103.txt new file mode 100644 index 0000000000000000000000000000000000000000..836e7c5bad5ee388379791d1bed12567514a28f2 --- /dev/null +++ b/assets/txts/pg_0103.txt @@ -0,0 +1,43 @@

For a more complete answer on the unique effects per predictive uncertainty method, we need to analyze dataset-specific results. Detailed results per dataset and metric (Appendix C.1 Fig. A.1) reconfirm that a method's superiority (i.e., for the whole application domain of in-domain text classification) should not be concluded based on a single dataset.
Each dataset has specific problem characteristics, which affect method ranking differently at varying magnitudes. However, the comparative performance of each method is not fully dependent on the dataset tested, with Deep Ensemble performing reliably in-domain as evidenced by rank.

3.5.2 Experiment: Cross-domain

This Subsection is dedicated to analyzing predictive uncertainty methods under domain shift. We first present results on cross-domain generalization, followed by a challenging OOD detection setting. Finally, we draw parallels between both settings' experimental results.

We conduct extensive experiments on the benchmark Amazon product review datasets on a total of 12 source-target domain configurations. Each domain is abbreviated by its first uppercase letter: (B)ooks, (D)VD, (E)lectronics, (K)itchen. Fig. 3.4 reports on the lowest cross-domain generalization gap between ID and OOD domain datasets. We observe higher ID accuracy for Kitchen and Electronics, which can indicate a relatively lower complexity of domain sentiment. Importantly, the gaps between Kitchen–Electronics and Books–DVD are the smallest overall, coinciding with our intuitions on domain similarity. Remarkably, regularized Deep Ensemble trained on Book reviews even scores higher accuracy (+1.8%) on its target domain (B→D).

Figure 3.4. Lowest accuracy generalization gap, in-domain (Acc_ID) minus out-of-domain (Acc_OOD) accuracy (y-axis), of all predictive uncertainty methods per source→target domain combination (x-axis).

To analyze the cross-domain performance of predictive uncertainty methods, we report the average rank of ID NLL and OOD accuracy (Fig. 3.5).

\ No newline at end of file diff --git a/assets/txts/pg_0104.txt b/assets/txts/pg_0104.txt new file mode 100644 index 0000000000000000000000000000000000000000..7eaaaad716a7af0baa13398af86f83cdbf881091 --- /dev/null +++ b/assets/txts/pg_0104.txt @@ -0,0 +1,35 @@

Figure 3.5. Average rank of in-domain NLL for the 4 source datasets (left) and out-of-domain accuracy over 12 source-target configurations (right) for all tested predictive uncertainty methods.

Heteroscedastic Concrete Dropout Ensemble ranks highest in-domain when evaluated with a proper scoring rule. Models without any regularization achieve higher OOD accuracy scores, with Deep Ensemble significantly outperforming more than half of the predictive uncertainty methods (first black bar). A possible explanation could be that most target domain data is more similar to the source domain than expected, effectively giving an edge to methods that achieve high ID accuracy.

To evaluate out-of-domain detection, we report AUROC ranks in Fig. 3.6 and additionally plot OOD detection over generalization scores in Fig. 3.7. Concrete Dropout Ensemble and variations outrank other methods on OOD detection. Nevertheless, we must nuance the ranking results, since the magnitude of AUROC is generally low, close to random (50-54%) with no class imbalance, over all 12 cross-domain settings. These results might indicate that, from the perspective of the methods tested, there are no salient differences between the different domains. More specifically, Books and DVD as a source have AUROC scores on target OOD domain data centered around 51%, and Kitchen and Electronics as a source have comparable AUROC scores with one higher-AUROC (54%) cluster for OOD Books and DVD targets.
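Concretely, the OOD-detection AUROC reported here treats an uncertainty quantity as a detector of target-domain inputs; a minimal sketch with predictive entropy as the score (array names, shapes, and the Dirichlet placeholders are assumptions):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    def predictive_entropy(samples):
        """samples: (T, N, K) probabilities from T stochastic passes/members."""
        mean_probs = samples.mean(axis=0)
        return -np.sum(mean_probs * np.log(mean_probs + 1e-12), axis=1)

    # placeholder sampled predictive distributions for ID and OOD inputs
    samples_id = np.random.dirichlet(np.ones(5), size=(20, 100))
    samples_ood = np.random.dirichlet(np.ones(5), size=(20, 100))

    h_id, h_ood = predictive_entropy(samples_id), predictive_entropy(samples_ood)
    y_true = np.concatenate([np.zeros_like(h_id), np.ones_like(h_ood)])
    y_score = np.concatenate([h_id, h_ood])
    auroc = roc_auc_score(y_true, y_score)   # ~0.5 means ID/OOD indistinguishable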
Figure 3.6. Average rank of OOD AUROC over 12 cross-domain settings for predictive uncertainty methods.

Additionally, Fig. A.2 in Appendix C.1 demonstrates a similarly clear difference

\ No newline at end of file diff --git a/assets/txts/pg_0105.txt b/assets/txts/pg_0105.txt new file mode 100644 index 0000000000000000000000000000000000000000..359a48721de4675da6a5ea7a51157c43780d72a8 --- /dev/null +++ b/assets/txts/pg_0105.txt @@ -0,0 +1,35 @@

in the correlation effect size of uncertainty quantities with ID-OOD data depending on the target domain, e.g., a high overall mean correlation (0.3) for the Kitchen source evaluated on the disparate domain of Books, whereas uncertainty correlation on Electronics averages around 0.1 for the most correlated quantities.

Figure 3.7. AUROC detection magnitude (y-axis) mapped over OOD accuracy (x-axis), with a legend on the right for methods that support uncertainty estimation.

3.5.3 Experiment: Novelty Detection

Before analyzing which predictive uncertainty methods provide better detection of instances of an unseen class, we report on how uncertainty metrics (cf. Section 3.3.3) correlate with novel class data.

In Fig. 3.8 the final rank over datasets confirms the superior robustness of predictive entropy as an uncertainty metric. Logically, it is closely followed by the maximum softmax score. Next, model uncertainty correlates generally well with novel class data. Interestingly, model uncertainty outperforms entropy on AAPD, with most methods showing the need for learning from more data to better approximate the model parameters.

Similarly to the evaluation of in-domain performance, we use CD diagrams (Fig. 3.10) with the binary detection metrics AUPR and AUROC to provide a ranking of predictive uncertainty methods over datasets.

The absolute pairwise comparisons (Table 3.9) confirm that hybrid predictive uncertainty methods improve detection of novel class data. Quite surprisingly, Deep Ensemble, which ranked highest in-domain, drops multiple ranks in favour of combination ensembles (Heteroscedastic Ensemble or even MC Concrete Dropout). The in-domain calibration effect from Concrete Dropout appears to carry over to this novelty detection setting. More importantly, it also helps boost the novelty detection performance of Deep Ensembles when jointly used (e.g., MC Concrete Dropout Ensemble).

\ No newline at end of file diff --git a/assets/txts/pg_0106.txt b/assets/txts/pg_0106.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f9ab5943df46a254ba4f8bd378c8ac4beee8dc0 --- /dev/null +++ b/assets/txts/pg_0106.txt @@ -0,0 +1,37 @@

Figure 3.8. We report the Pearson Correlation Coefficient (PCC) between uncertainty values and the binary variable ID-OOD for 5 benchmark datasets: (a) 20news, (b) CLINC150, (c) imdb, (d) Reuters, (e) AAPD. A higher absolute correlation score points to a stronger association of uncertainty and novelty detection. *Model Uncertainty (MU), Data Uncertainty (DU), Mutual Information (MI).

While comparing over 5 datasets, there is no critical difference between the average ranking of methods, which can point to task or dataset-specific interactions. Fig.
3.11 shows the variation of AUROC performance for the different methods, from which we can observe that (non-finetuned) dropout sampling (MC Dropout) underperforms in most datasets, most clearly on AAPD, by severely underestimating uncertainty on samples of a novel class. We also observe relative benefits of the Heteroscedastic loss function for multi-class text classification, which is most clearly reflected in the CLINC150 results. The same visualization allows us to evaluate the quality of uncertainty quantification for each method. Generally, epistemic uncertainty derived from ensembles offers higher-quality detection of novel class data than single-model predictive uncertainty. This effect is clearly visible for multi-class classification, where the ensembles clearly group on top, as opposed to the results for the multi-label datasets.

\ No newline at end of file diff --git a/assets/txts/pg_0107.txt b/assets/txts/pg_0107.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4c0b827b135bd4b58a592895758a0e51e30f3fa --- /dev/null +++ b/assets/txts/pg_0107.txt @@ -0,0 +1,108 @@

method                                          wins  draws  losses
MC Concrete Dropout Ensemble                     121      1      48
Heteroscedastic Ensemble                         119      1      50
MC Concrete Dropout                              109      1      60
MC Heteroscedastic Ensemble                      102      0      68
Deep Ensemble Regularized                        100      0      70
Concrete Dropout                                  90      1      79
MC Heteroscedastic Concrete Dropout Ensemble      89      2      79
MC Heteroscedastic Concrete Dropout               86      1      83
Concrete Dropout Ensemble                         83      0      87
Regularized                                       81      1      88
Heteroscedastic                                   80      0      90
Deep Ensemble                                     80      0      90
Heteroscedastic Concrete Dropout Ensemble         75      2      93
MC Heteroscedastic                                75      0      95
MC Ensemble                                       71      2      97
Unregularized                                     69      0     101
Heteroscedastic Concrete Dropout                  47      1     122
MC Dropout                                        46      1     123

Figure 3.9. Novelty detection AUROC and AUPR pairwise comparison counts of wins/draws/losses.

Figure 3.10. Novelty detection CD diagram of AUROC.

Additionally, we visually detail in Appendix C.1 Fig. A.3 density estimates for uncertainty quantities with respect to in-domain versus novel data, with most hybrid ensemble methods demonstrating better-separable densities.

3.5.4 Experiment: Ablations

In this Subsection, we zoom in on the best-performing uncertainty prediction methods relative to the complementary benefits hypothesized for hybrid approaches (Section 3.4.1), provide explanations for results specific to an architecture (TextCNN vs. BERT, Section 3.4.3), and present ablations on critical hyperparameters.

\ No newline at end of file diff --git a/assets/txts/pg_0108.txt b/assets/txts/pg_0108.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c81b207423eb47ec1743a5b7d6d5092b6885d8a --- /dev/null +++ b/assets/txts/pg_0108.txt @@ -0,0 +1,32 @@

Figure 3.11. Comparison with AUROC(↑) and epistemic uncertainty PCC(↑) for task and dataset-specific differences in novel class detection: (a) AUROC, (b) Epistemic uncertainty. Methods with 0 correlation do not support model uncertainty quantification.

3.5.4.1 Diversity

Diversity of samples drawn from a posterior, either via T MC samples and/or M ensemble components, is an important condition for efficient uncertainty estimation. If each sample presents a similar function, the overall prediction can be overconfident, and drawing more samples will not reduce this.
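Anticipating the diversity measure formalized next, a minimal sketch of function-space diversity as the average KL divergence between each sampled prediction and the predictive mean; array names and shapes are assumptions:

    import numpy as np

    def function_space_diversity(samples):
        """samples: (T, N, K) probabilities from T posterior samples
        (ensemble members or MC-dropout passes). Returns the mean KL
        between each sampled prediction and the predictive mean."""
        eps = 1e-12
        mean_probs = samples.mean(axis=0, keepdims=True)      # (1, N, K)
        kl = np.sum(samples * (np.log(samples + eps) - np.log(mean_probs + eps)),
                    axis=-1)
        return kl.mean()                                      # average over T and N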
We derive a small experimental setting from [118] to measure function-space diversity for all predictive uncertainty methods involving posterior sampling. In Fig. 3.12 we analyze the relation between accuracy and diversity, as measured by the Kullback-Leibler divergence between a sampled prediction and the predictive mean, (1/T) Σ_{t=1}^{T} KL( p(y*|x*, θ̂_t) || p̄(y*|x*, θ̂) ). For a fair comparison, we calculate diversity at the ensemble level if a predictive uncertainty method consists of multiple models, else at the dropout sample level.

While the diversity-accuracy plane does not show a one-to-one linear relationship, we note in Fig. 3.12 (a,b,d) promising results for hybrid ensemble methods, which with higher diversity improve on accuracy over Deep Ensemble. The visual of imdb (c) registers overall low diversity, even for simple predictive uncertainty methods, which generally achieve higher diversity, albeit by capturing multiple dissimilar yet weaker functions. For AAPD (e), most methods are tied in exact accuracy even with different diversities.

\ No newline at end of file diff --git a/assets/txts/pg_0109.txt b/assets/txts/pg_0109.txt new file mode 100644 index 0000000000000000000000000000000000000000..60affdc236c78b2bac1b5d15090544c519af81a4 --- /dev/null +++ b/assets/txts/pg_0109.txt @@ -0,0 +1,37 @@

Figure 3.12. Detailed accuracy scores mapped over diversity measured by average KL divergence for each of the benchmark datasets: (a) 20news, (b) CLINC150, (c) imdb, (d) Reuters, (e) AAPD.

3.5.4.2 NLP Architecture

We selected specific representative predictive uncertainty methods on the basis of our previous experiments to run with the Transformer BERT as base architecture. We argue that the chosen architecture can have a non-negligible impact on uncertainty estimation, and we compare with the simple yet controllable TextCNN architecture in order to investigate whether the same conclusions hold for novelty detection.

The separate Out-of-Scope set of CLINC150 allows us to easily evaluate novelty detection with BERT. We observe in Fig. 3.14 on CLINC150 that BERT does increase novelty detection over all metrics. Even without any hyperparameter tuning, Unregularized BERT outperforms all TextCNN models. Overall, we register the same ranking of predictive uncertainty methods, although a Deep Ensemble with BERT is superior to hybrid ensembles. Crucially, we note that the correlation of epistemic uncertainty with novelty detection is higher for each TextCNN ensemble than for every single BERT model.

\ No newline at end of file diff --git a/assets/txts/pg_0110.txt b/assets/txts/pg_0110.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f25806344ef7e72d75921b910c8e78e12f669bd --- /dev/null +++ b/assets/txts/pg_0110.txt @@ -0,0 +1,31 @@

Figure 3.13. Novelty detection scores mapped per architecture for the benchmark datasets without a dedicated OOD split: (a) ID Accuracy - imdb, (b) ID Accuracy - Reuters, (c) AUROC - 20news, (d) AUROC - Reuters, (e) Epistemics - 20news, (f) Epistemics - imdb. The legend of Fig. 3.11 applies here.

Figure 3.14. Detailed AUROC-epistemics (PCC) scores mapped per architecture on CLINC150. Best performance: upper-right corner. The legend of Fig. 3.11 applies here.

Most notably, results on all other datasets are inconsistent with the above.
\ No newline at end of file diff --git a/assets/txts/pg_0111.txt b/assets/txts/pg_0111.txt new file mode 100644 index 0000000000000000000000000000000000000000..607996c7119d3637f8147dbe4dfe0ace81b3c8de --- /dev/null +++ b/assets/txts/pg_0111.txt @@ -0,0 +1,38 @@

For comparison, we have trained an informed sub-selection of predictive uncertainty methods with BERT as base architecture (Fig. 3.13).

Generally, we observe in (a,b) higher ID accuracy for BERT, with relatively smaller gains from ensembling. AUROC scores (c,d) are well below even single TextCNN models, pointing to a crucial deficiency of BERT in a novelty detection setting. The correlation of epistemic uncertainty with novel class samples draws a similar picture (e,f). MC Heteroscedastic Concrete Dropout Ensemble on imdb does produce more correlated epistemic uncertainty than all other methods.

Figure 3.15. Visualization of representative dataset-quantity/metric combinations mapped over stepwise increasing ensemble size M: (a) 20news - MI, (b) CLINC150 - AUROC, (c) imdb - MU, (d) Reuters - MU, (e) AAPD - H. Note that positive and negative correlations follow from the quantity reported. Given the small relative differences, plots are best viewed online.

3.5.4.3 Ensemble size M

Combining models into an ensemble generally benefits performance both in- and out-of-domain. Previous research [118, 238] has shown that ensembling benefits stagnate with larger ensemble size M. Fig. 3.15 selectively reports novelty detection

\ No newline at end of file diff --git a/assets/txts/pg_0112.txt b/assets/txts/pg_0112.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d2a5760cc843cb4f8a49d17efcf0f0cac203555 --- /dev/null +++ b/assets/txts/pg_0112.txt @@ -0,0 +1,33 @@

metrics or uncertainty correlation scores for all ensemble-based methods of different sizes.

The AUROC score for CLINC150 (3.15b) is a representative example of the expected effect of ensembling. Importantly, it provides crucial evidence for our general hypothesis, demonstrating that ensembling over predictive uncertainty methods gives complementary benefits in novelty detection settings. What is similarly interesting is that the relative benefit of ensembling shows slightly different curves in certain cases. Epistemic uncertainty for imdb (3.15c) already attains similar performance at M=2, again showing a comparatively slower (since less required) increase at larger M for hybrid ensembles. AAPD (3.15e) shows more stagnant behavior for the reliability of entropy with growing ensemble size, irrespective of the predictive uncertainty method.

3.5.4.4 Concrete Dropout p

Figure 3.16. Learned dropout probability per layer for each method with Concrete Dropout. The first 3 layers are the CNN kernels (K1–K3), followed by the penultimate layer µ, possibly with σ for modeling heteroscedasticity. The legend of Fig. 3.17 applies here.

Fig. 3.17 relays an important observation on the dataset-wise adaptation of Concrete Dropout: the learned dropout rate increases as required for the problem at hand. This reinforces the argument against fixed-rate dropout. [125] remarked that practitioners have started to adopt the strategy of fine-tuning dropout with a bottleneck pattern, i.e., starting with a higher dropout rate in early layers and decreasing it the deeper you go in the network. Our results (Fig.
3.16) show a discrepancy with this practice, specifically for 20news and CLINC150.

\ No newline at end of file diff --git a/assets/txts/pg_0113.txt b/assets/txts/pg_0113.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb4e7a903c715650908e86bcbe9bb23048b0d38c --- /dev/null +++ b/assets/txts/pg_0113.txt @@ -0,0 +1,33 @@

We do note that both datasets converged to low dropout rates, which can provide the basis for this differing behavior.

Figure 3.17. Top: average epoch of convergence per dataset. Bottom: average learned Concrete Dropout probability per dataset over predictive uncertainty methods. We observe very dataset-dependent dropout rates.

3.6 Discussion

Our study investigates both scalable and hybrid procedures for incorporating uncertainty into DL models for text classification. Next to baseline in-domain uncertainty evaluation, we have designed two experimental settings, novelty detection and cross-domain classification, to analyze the reliability of uncertainty. Additionally, we devised ablation studies to analyze important hyperparameters in connection to our three hypotheses (Section 3.4.1) on complementary benefits for hybrid uncertainty prediction methods.

Benchmarking uncertainty methods We summarize our findings succinctly and discuss the results of each experimental setting.

We find that individually (> indicating “outperforms” over all experiment settings):

Deep Ensemble > Concrete Dropout > (MC) Heteroscedastic ≥ MC Dropout

We find that jointly, by considering method combinations:

(MC) Concrete Dropout Ensemble ≥ (MC) Heteroscedastic Ensemble > MC Concrete Dropout > Deep Ensemble > Deep Ensemble Regularized > MC Dropout

\ No newline at end of file diff --git a/assets/txts/pg_0114.txt b/assets/txts/pg_0114.txt new file mode 100644 index 0000000000000000000000000000000000000000..055fb52976534f236915c90070cafc9f3802df78 --- /dev/null +++ b/assets/txts/pg_0114.txt @@ -0,0 +1,42 @@

In-domain results (Section 3.5.1) corroborate the superiority of Deep Ensemble with high accuracy and proper scores (NLL, Brier). Table 3.3 demonstrates that the improvements come from accuracy as opposed to calibration, where Concrete Dropout-based methods rule.

Cross-domain experiments (Section 3.5.2) give differing conclusions: cross-domain generalization results are similar to in-domain, whereas out-of-domain detection follows the novelty detection results. Our evaluation of uncertainty quantities (Fig. A.2) demonstrates reliably higher correlation of uncertainty with domain discrepancy. We do take note of the relatively low-magnitude AUROC (Fig. 3.6), which underlines how challenging out-of-domain detection is in a domain adaptation setting with comparably similar linguistic patterns.

Novelty detection (Section 3.5.3) in text classification gives the reverse results: hybrid ensemble methods with Concrete Dropout rank highest as scored by AUROC, AUPR and model uncertainty correlation, followed by other method combinations that induce calibration. We do note that specific method performance is often tied to task and dataset characteristics, with results averaged over the 5 benchmark sets showing statistically non-significant differences between methods. As shown in Table 3.9, standard Deep Ensemble, i.e., without any regularization or prior from combining methods, performs worse outside the in-domain setting.
The case for standard MC Dropout is even worse, with novel class robustness (AUROC and AUPR) lower than the Unregularized point-estimate model.

Remarkably, BERT performs worse than the simpler TextCNN model at detecting distribution shift in the form of novel class data (Fig. 3.14). Results on the OOS set of CLINC150 differ from results obtained on all other datasets, which we believe can be attributed to the short, in-domain intent commands differing strongly in vocabulary from the OOS samples, resulting in a comparatively less challenging novelty detection setting. We contend that novelty detection is actually more challenging for BERT despite its pretrained language modeling knowledge and because of the strict requirement to fine-tune the task-specific final layer with new supervision. Its ability to detect (and overly rely on, e.g., [162]) statistically relevant yet possibly spurious cues in language data will make it overconfident when transferring to a new task where the i.i.d. assumption cannot be maintained.

Validating hybrid approaches We have empirically analyzed individual and joint effectiveness in modeling predictive uncertainty and will answer our three hypotheses on complementary benefits from combining inter- and intra-model posterior approximation.

\ No newline at end of file diff --git a/assets/txts/pg_0115.txt b/assets/txts/pg_0115.txt new file mode 100644 index 0000000000000000000000000000000000000000..1440627302f4a2dcfd5000a5774b80b175cfe38a --- /dev/null +++ b/assets/txts/pg_0115.txt @@ -0,0 +1,46 @@

Firstly [A], ensembling (increasing M) proves to give relatively higher performance benefits than stochastically sampling predictions from an optimized solution (T). The effect is clearest in the in-domain setting (Table 3.3) and is less pronounced in the out-of-domain settings. For a given predictive uncertainty method, we cannot provide solid evidence that uncertainty reliability always improves with subspace sampling (increasing T, “MC”). AUROC and AUPR rankings (Figs. 3.10 and 3.6) present evidence in favour, although Fig. 3.11 depicts a more fine-grained comparison over datasets and uncertainty methods. Our analysis of diversity (Fig. 3.12) shows promising results for hybrid ensemble methods, which exhibit higher diversity in posterior samples, resulting in improved accuracy.

Secondly [B], our newly proposed hybrid uncertainty estimation methods improve effectively over singular methods, both in novelty detection (Table 3.9 and Figs. 3.10, 3.11) and out-of-domain detection (Fig. 3.6). Additionally, in ablation studies we find (Fig. 3.15) that combining predictive uncertainty methods in an ensemble attains higher performance with a lower number of models (M < 5) compared to a Deep Ensemble (M = 5).

Thirdly [C], Table 3.3 demonstrates that MC Concrete Dropout improves over MC Dropout (p=0.5) on ECE and proper scoring rules. The out-of-domain experiments (detail: Fig. 3.11) similarly show that not fine-tuning dropout to the dataset and task at hand is detrimental, even when combining models into an ensemble (e.g., MC Ensemble vs. MC Concrete Dropout Ensemble). Ablation on Concrete Dropout (Fig. 3.17) points to very dataset-dependent learned probability rates, which vary strongly layer-wise (Fig. 3.16). We link the empirical superiority of MC Concrete Dropout Ensemble to balanced posterior collapse, thanks to the VI-based optimization of the dropout prior.
We tentatively claim that the former provides constrained hypothesis support and a more fine-tuned influence of the prior.

Benchmark comparison When comparing our results to existing BDL benchmarks, most observations are consistent for in-domain and out-of-domain performance.

Our in-domain results are most similar to [348], where Deep Ensemble outperforms most methods —albeit their survey did not compare combinations of predictive uncertainty methods— whereas in our benchmark it is closely followed by hybrid ensemble methods. When evaluating over various data retention rates, [113] observed that “an ensemble of MC Dropout models” (our MC Ensemble) consistently outperforms all other methods. This survey offers the closest point of comparison, although our experimental settings vary. While we cannot directly compare cross-domain detection with other benchmarks, we argue that our cross-domain classification setting mimics their low-data-regime

\ No newline at end of file diff --git a/assets/txts/pg_0116.txt b/assets/txts/pg_0116.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e2c939bd0eb6bdd2650d0377a96ed199e1afeae --- /dev/null +++ b/assets/txts/pg_0116.txt @@ -0,0 +1,44 @@

experiments.

Across different modalities and tasks, Deep Ensemble has been reported to consistently outperform VI-based methods, most specifically MC Dropout, with/without distribution shift (image classification [348], molecule prediction [409], and pendulum physics [56]). However, for a binary image classification problem, [113] report higher accuracy for MC Dropout compared to Deep Ensemble, whereas our results suggest that MC Dropout can induce positive calibration, yet scores lower on accuracy and with proper scoring rules. In their experiments they use a fixed dropout rate of 0.2 and a fine-tuned weight decay rate, well-suited to their task at hand, which may explain the optimistic results. Another uncertainty quantification benchmark [462] reports strong results on image classification for various Monte Carlo methods, although we cannot make a direct comparison. For further discussion, we refer the reader to Section 3.7.1.

Our results suggest that BERT performs worse in a novelty detection setting, whereas [174] concludes that Transformers are considerably more robust when compared across domains, e.g., detection of news samples with a sentiment classifier. We point out below that both settings are in fact incomparable. We evaluate detection on novel samples which share vocabulary characteristics with the source domain, albeit excluded from training supervision. Their setting evaluates detection between very disparate domains, where linguistic patterns are significantly different and BERT will most probably fall back to its pretrained knowledge for detection. In short, we do believe that pretrained Transformers could perform better under varying distribution shifts, yet our results underline the exception of novel class detection. More research is needed into how the inductive bias of given NN architectures influences approximate inference.

Take-homes For predictive uncertainty in text classification, we derive a number of take-homes from the benchmarking evidence, centered around practical facets to consider for applications.
One has to consider (i) ease and cost of implementation, (ii) computational and memory complexity, comprising training compute, test compute and storage/memory constraints, (iii) the degree of fine-tuning required, (iv) the type of supervision: multi-class with a low/high number of classes (K) or multi-label with low/high cardinality (C), (v) the expectation of distribution shift, in the form of novel class data or unseen language patterns, and (vi) support for uncertainty quantification by source.

For a prototypical low-K multi-class text classification task, we advise Deep

\ No newline at end of file diff --git a/assets/txts/pg_0117.txt b/assets/txts/pg_0117.txt new file mode 100644 index 0000000000000000000000000000000000000000..92b8aefe4228a82875da7de2b5d024f51d725e5e --- /dev/null +++ b/assets/txts/pg_0117.txt @@ -0,0 +1,46 @@

Ensemble for solid in-domain performance and adequate distribution-shift robustness. In the case of memory or storage constraints, for example if your base model already has high complexity, using (MC) Concrete Dropout will provide calibration benefits both in- and out-of-domain, albeit at a slightly larger implementation cost. Similarly, to constrain computational complexity, it can be more sensible to rely on a TextCNN ensemble (5×6M parameters) rather than BERT (110M parameters). Considering time complexity, we have added detailed compute, time and storage statistics for the evaluated methods (Appendix B.2). We would advise against using MC Dropout if the dropout rate and weight regularization are not fine-tuned for the problem at hand. Our benchmarking experiments demonstrate the unpredictable behavior of fixed-rate MC Dropout compared to Concrete Dropout, which we used as a proxy for models with a fine-tuned dropout rate. This (mal)practice should be highlighted, as it has a substantial impact on uncertainty estimation and robustness.

If K starts to increase, it warrants the effort to implement the Heteroscedastic loss function, which will make the model more calibrated in-domain. Additionally, it enables data uncertainty estimation for possibly noisy ground truths, which occur more frequently with a larger number of classes.

If C grows larger, reliable epistemic uncertainty estimation becomes more important, since the problem is made more complex by the larger number of label combinations. Our evidence is slightly contradictory, with results obtained on Reuters suggesting MC Concrete Dropout Ensemble and on AAPD warranting Deep Ensemble. What should be clear is that any form of ensembling is valuable in multi-label classification to boost performance.

Under the expectation of distribution shift in the form of novel class data, adding Concrete Dropout with stochastic sampling to an ensemble, MC Concrete Dropout Ensemble, gives relatively strong benefits compared to a regular Deep Ensemble. Ablations also show that fewer models (M) would be required to reach similar performance. Generally, in-domain calibration-inducing methods are more robust when applied in the tested out-of-domain settings. For the in-domain setting, the incorporation of data uncertainty incrementally improves multi-class text classification. Ablation on NLP architectures (Section 3.5.4.2) points to a deficiency of BERT for detecting novel class data; under that expectation, we would similarly advise against BERT in favour of simpler text classification architectures.
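As a final practical note on the sampling-based methods recommended above, a minimal PyTorch sketch of test-time dropout sampling (the “MC” in the method names): dropout layers are kept active at inference and T stochastic passes are averaged. The model and input batch are assumed to exist; this is an illustration, not our exact implementation.

    import torch

    @torch.no_grad()
    def mc_dropout_predict(model, x, T=10):
        """Average T stochastic forward passes with dropout kept active."""
        model.eval()                             # freeze batch-norm etc.
        for m in model.modules():                # ...but re-enable dropout
            if isinstance(m, torch.nn.Dropout):
                m.train()
        probs = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(T)])
        return probs.mean(dim=0), probs          # predictive mean and raw samples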
3.7 Additional Uncertainty Approaches

Next to the method combinations benchmarked in the main work, we acknowledge two alternative approaches to uncertainty estimation with appealing

\ No newline at end of file diff --git a/assets/txts/pg_0118.txt b/assets/txts/pg_0118.txt new file mode 100644 index 0000000000000000000000000000000000000000..c97416ae2a0f9c11e5b9c517674bee1a1015cda3 --- /dev/null +++ b/assets/txts/pg_0118.txt @@ -0,0 +1,46 @@

properties such as training scalability and cheaper inference.

3.7.1 Stochastic Gradient MCMC Methods

There exists a wide range of sampling-based inference methods in the stochastic gradient MCMC (SG-MCMC) literature, which have become increasingly tractable and empirically successful for uncertainty estimation. Specifically, we re-implemented an exemplary approach [530], cyclical SG-MCMC (cSG-MCMC), which uses a cosine cyclical learning rate schedule [292] to (i) better explore the highly multimodal loss landscape and (ii) sample more efficiently from the posterior; a sketch of the schedule is given at the end of this Subsection. While this appealing approach reduces computational complexity by only training a single model, we experienced that it is very tricky to fine-tune, with many interacting hyperparameters. Instead of benchmarking these methods and reporting scores over ranges of hyperparameters, we provide a discussion of the perceived gap between theory and practice for this family of uncertainty methods.

While the stochastic MCMC setting, estimating parameter updates from minibatches, is computationally convenient, it induces several theoretical challenges: (i) minibatch noise is introduced from small subsets of data [297], (ii) omission of the Metropolis-Hastings correction step provides fundamentally biased estimates of posterior expectations [192], and (iii) the suggested practice of temperature tempering implies an approximation to the exact posterior instead of proper convergence [122, 491].

Closer to practice, [530]'s methods have been successfully benchmarked [462, 491], with reported performance on OOD detection for image classification datasets comparable to or better than Deep Ensembles. An important caveat is that all hyperparameters have been meticulously fine-tuned to the task at hand. This is non-trivial given the additional need to specify the number of cycles (guided by a training budget) and the proportion of burn-in steps, and to find an appropriately tempered posterior. The original work [530] mentions little dependence of results on these modifications to the optimization procedure, yet, similar to [122], we observed “the complexity and fragility of hyper-parameter tuning, including the learning rate schedule and those that govern the simulation of a second-order Langevin dynamics”. Additionally, making combinations of uncertainty methods with cSG-MCMC is non-trivial, since regularization in any form influences the large-scale curvature of the regions the optimizer explores.

With regard to re-implementation, we experienced issues with the indexing of sparse gradient updates for the embedding lookup, an operation pervasive in NLP architectures.
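As referenced above, a minimal sketch of the cosine cyclical step-size schedule (our paraphrase of the schedule form described in [530]; function and variable names are ours): the step size restarts at the beginning of each cycle, alternating exploration (large steps) and sampling (small steps).

    import math

    def cyclical_lr(step, total_steps, n_cycles, lr0):
        """Cosine cyclical schedule: step size restarts every cycle."""
        cycle_len = math.ceil(total_steps / n_cycles)
        t = (step % cycle_len) / cycle_len        # position within the cycle
        return 0.5 * lr0 * (math.cos(math.pi * t) + 1.0)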
\ No newline at end of file diff --git a/assets/txts/pg_0119.txt b/assets/txts/pg_0119.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e77edfcf590ffc35651e001106fe28ae2857467 --- /dev/null +++ b/assets/txts/pg_0119.txt @@ -0,0 +1,47 @@

Our original baseline models were trained with the Adam optimizer, which consistently outscored any of our cSG-MCMC experiments built upon SGD modifications.

There is an unmistakable complexity in how to sample appropriately from the true posterior, as we now rely heavily on the training data, a “weak” regularizer, to determine how noise is added for parameter-space exploration. Concurrently, the overparametrized regime is becoming commonplace in DL, especially in NLP with the advent of Transformers, which calls for more sensible priors over many millions of parameters [453] and a better understanding of how output functions are affected [107]. We believe stronger priors are available, not only over parameters P(θ) but rather over functions P(fθ(x)) as specified by the choice of architecture [192], which can make this family of methods an even more competitive challenger.

3.7.2 Spectral-normalized Neural Gaussian Process

[283] propose the Spectral-normalized Neural Gaussian Process (SNGP), a principled, scalable approach to uncertainty estimation for deep NNs. They promote “distance awareness” as a necessary condition, which they accomplish via spectral weight normalization and a GP output layer. Thanks to the mean-field approximation [295], a single forward pass suffices, without MC sampling, to estimate the predictive distribution. Empirically, SNGP was shown to outperform Deep Ensemble by some margin on OOD detection for both image and text data. By demonstrating the relative importance of the decision boundary of a single model fθ(y|x) versus averaging over multiple models, we are inspired to analyze the combination of SNGP with alternate uncertainty methods.

We have re-implemented SNGP using components of edward2 [454]: Laplace approximation, random-feature GP and spectral normalization. In our experience, the most crucial hyperparameters to fine-tune were the number of inducing points (ι ≤ 1024) and the spectral norm multiplier s. For the latter, we follow the recommended tuning procedure to find an appropriate value in the range {1, 2, 5, (10, 15)}, where we heuristically increased the search space.

For simplicity and computational reasons, we use TextCNN as base architecture. However, in order to correctly apply spectral normalization to convolutional filters [151], we had to re-implement TextCNN(v2) with 2D convolutions and maxpooling. This in turn requires specifying a fixed sequence length in advance, which invalidates a direct comparison to the experimental results of Section 3.5. We additionally re-train base models with TextCNN(v2) and combine SNGP with our Regularized baseline (Reg), with MC Dropout (MCD), Concrete Dropout

\ No newline at end of file diff --git a/assets/txts/pg_0120.txt b/assets/txts/pg_0120.txt new file mode 100644 index 0000000000000000000000000000000000000000..59e8081a1210a35c666716bb6a59cdf9f90985cf --- /dev/null +++ b/assets/txts/pg_0120.txt @@ -0,0 +1,32 @@

(CD) and Ensemble (Ens). For SNGP ensembles, we empirically selected s = 15 for the base model.
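For illustration, a minimal NumPy sketch of the soft spectral-normalization step with norm multiplier s, using one power-iteration update per call as commonly approximated; this is our paraphrase, not the edward2 implementation.

    import numpy as np

    def spectral_normalize(W, u, s=1.0, eps=1e-12):
        """One power-iteration estimate of the largest singular value of W;
        rescale W only when its spectral norm exceeds the multiplier s."""
        v = W.T @ u
        v /= (np.linalg.norm(v) + eps)
        u = W @ v
        u /= (np.linalg.norm(u) + eps)
        sigma = u @ W @ v                    # spectral-norm estimate
        if sigma > s:
            W = W * (s / sigma)              # soft constraint: ||W||_2 <= s
        return W, u                          # carry u over to the next step

    # usage sketch: persist u across training steps
    W = np.random.randn(64, 32)
    u = np.random.randn(64)
    W, u = spectral_normalize(W, u, s=15.0)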
3.7.2.1 SNGP Results

First, we present critical difference analyses for in-domain classification (Fig. 3.18) and novelty detection (Fig. 3.19). Ensembling SNGP models, Deep Ensemble SNGP, proves superior in-domain, followed by Concrete Dropout Ensemble with and without SNGP. For novelty detection, (MC) Deep Ensemble is most successful, with small differences between the next highest-ranked methods. To our surprise, SNGP ranks quite low on the text classification tasks, although in the original work it demonstrated OOD detection superior to Deep Ensemble. In what follows, we analyze the novelty detection ranking of SNGP, specifically per dataset and for multiple values of s.

Figure 3.18. CD diagram of NLL for base and SNGP method combinations with a TextCNNv2 backbone.

Figure 3.19. CD diagram of AUROC for base and SNGP method combinations with a TextCNNv2 backbone.

In order to zoom in on the relative ranking of SNGP (combination) methods, we plot in Fig. 3.20 AUROC detection scores for datasets with interesting trend changes. Overall, SNGP underperforms on CLINC-OOS, with the exception of Deep Ensemble SNGP. For 20news, SNGP and Deep Ensemble SNGP rank

\ No newline at end of file diff --git a/assets/txts/pg_0121.txt b/assets/txts/pg_0121.txt new file mode 100644 index 0000000000000000000000000000000000000000..14eae1b0607c4905d5ce46ff6a8244bf2ac66d3f --- /dev/null +++ b/assets/txts/pg_0121.txt @@ -0,0 +1,22 @@

high, although any additional regularization with SNGP worsens detection, even as an ensemble. For Reuters, we observe the exact opposite of 20news, with SNGP reporting high detection scores only when regularization is added, e.g., Regularized SNGP. Remarkably, this trend is reversed for the base model, with Unregularized scoring particularly well.

Figure 3.20. AUROC scores over unique (abbreviated) methods per dataset. Error bars are computed over multiple runs (5 seeds) for non-ensembles.

Finally, Fig. 3.21 reports on how novelty detection varies for different values of the spectral normalization multiplier s. As the trend lines indicate, larger values of s generally improve novelty detection, although AUROC varies more (larger shading) between methods and datasets. This observation prompts us to investigate the optimality of s per dataset. The right subplot shows that spectral norm multipliers are very dataset-dependent and that searching beyond the originally suggested range can give large performance boosts.

\ No newline at end of file diff --git a/assets/txts/pg_0122.txt b/assets/txts/pg_0122.txt new file mode 100644 index 0000000000000000000000000000000000000000..16d7ebe0d0b248708d8c52e7172f922db3c892db --- /dev/null +++ b/assets/txts/pg_0122.txt @@ -0,0 +1,38 @@

Figure 3.21. Left: AUROC scores (y-axis) over all datasets with unique runs plotted for base (s = 0) and SNGP TextCNNv2 models with varying spectral normalization multipliers (x-axis). Lines with shading indicate the trend observed between AUROC and s. Right: AUROC mean and stddev over runs, sampling and datasets.

3.7.2.2 SNGP Discussion

While SNGP was reported to outperform Deep Ensemble in the original CLINC OOD detection experiments [283], our results do not deliver the same ranking.
While investigating the interaction of SNGP with different uncertainty methods, we observe the non-trivial role of spectral normalization, specifically setting the norm multiplier s to an appropriate value. Additionally, we contribute an analysis of the interplay with additional regularization mechanisms, which was missing in the literature. The original work mentions that, given an approximation with the power iteration method, there is no precise control over the true spectral norm. Whereas spectral normalization keeps the magnitude of updates to weights in check, Dropout regularization and weight decay may rescale layers' spectral norm in unexpected ways. We hope our experimentation demonstrates the need for a deeper understanding of how to combine multiple regularization mechanisms and maintain a good spectral norm approximation for effective posterior approximation.

3.8 Limitations

As with the majority of benchmarking literature in Bayesian Deep Learning, the design of the current study is subject to limitations.

The first limitation concerns selection bias for text classification datasets. We benchmark 6 prototypical text classification datasets covering binary, multi-class, and multi-label classification by topic, sentiment and intent. The task

\ No newline at end of file diff --git a/assets/txts/pg_0123.txt b/assets/txts/pg_0123.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0da6c8837ec352152d8b5ff575ddbbc602da1b2 --- /dev/null +++ b/assets/txts/pg_0123.txt @@ -0,0 +1,45 @@

domain of text classification is very large, with additionally interesting variations of (i) short social media or long business document text, (ii) hierarchical or extreme multi-label text classification, and (iii) challenging task settings such as fake news detection or reading comprehension. Since these present open sub-problems in text classification, we did not consider them for our benchmarking study, yet we encourage their analysis in future research.

The second limitation is related to the representativeness of uncertainty quantification methods. We specifically opted for scalable procedures which have been increasingly gaining attention from practitioners. In total we derive 18 method combinations from two competing predictive uncertainty procedures, for which we already resort to statistical summaries and rank-based evaluation to present results. Due to computational constraints (retraining a minimum of 5 ensembles of size M = 5 per dataset and per experiment setup), we did not consider a natural Bayesian extension of Deep Ensemble, the Bayesian Ensemble [360], where all weight initialization is shared around a single prior. Additionally, Section 3.7 includes preliminary experiments with two new uncertainty approaches, cyclical SG-MCMC [530] and SNGP [283], which are less practical to benchmark, but bring promising ideas for improved, high-quality uncertainty estimation.

Finally, evaluating the quality of uncertainty quantification is an open problem in BDL, typically approached with proxy setups, as is the case in our benchmark with its focus on novelty detection and cross-domain generalization. Section 3.3.5 presents a nuanced view of this evaluation practice. In addition, evaluating reliable uncertainty estimation in NLP as opposed to other modalities is complicated due to the discrete nature of language.
Ideally, we would have extended our benchmark with more probing setups covering situations where we expect predictive uncertainty to be crucial, for instance, when dealing with noisy supervision/inputs or low-data regimes.

3.9 Chapter Conclusion

In general, while seeking to optimize for a well-approximated (whether or not Bayesian) posterior, current predictive uncertainty methods are imperfect and very often not practically useful. However, the need for practical and scalable solutions for both incorporating uncertainty and evaluating its quality is huge, as it is a prerequisite to reliable automation. Uncertainty quantification requires modality- to task-specific benchmarking to help practitioners safely rely on these methods and to inform researchers in prioritizing the right approaches.

In this work, we have presented empirical evidence from benchmarking uncertainty methods in text classification, contributing and calling attention

\ No newline at end of file diff --git a/assets/txts/pg_0124.txt b/assets/txts/pg_0124.txt new file mode 100644 index 0000000000000000000000000000000000000000..daf45b2434af04f1ea5f04090cf8d298f476808c --- /dev/null +++ b/assets/txts/pg_0124.txt @@ -0,0 +1,43 @@

to the under-explored study of uncertainty quality and model robustness in realistic NLP data distributions.

Interestingly, we find that the general behavior of predictive uncertainty methods does not hold over different datasets, with method performance often tied to the text classification task. Overall, we cannot discern a clear winning predictive uncertainty procedure, yet some methods clearly perform worse. Although a universal methodology is absent, we observe that there are specific correlations between a method's performance and the problem setting representing text classification task characteristics, for which we have formulated practical take-homes.

An important contribution is the proposed novel combinations of predictive uncertainty methods. Our benchmarking experiments have revealed MC Concrete Dropout Ensemble to be overall superior at novel class and out-of-domain detection in text classification, even with a lower ensemble size. Most notably, it outperforms Deep Ensemble, which has leading performance in recent BDL surveys on image data. We linked complementary benefits of hybrid uncertainty estimation methods to ongoing research on NN diversity in function-space and have provided more evidence in support of hybrid approaches. We have determined in an ablation study that M, the ensemble size, T, the number of Monte Carlo samples, and p, the dropout probability rate, are crucial hyperparameters to take into consideration for improved robustness and uncertainty estimation. Finally, we experimentally validated predictive uncertainty methods on real-world text classification tasks, including multi-label targets, coupling our hypotheses and results to the NLP problem space. Crucially, we found an important deficiency of BERT, compared to the simpler NLP architecture TextCNN, with respect to novel class robustness, limiting the applicability of transfer learning from pretrained Transformers under the expectation of uncertainty and novel class instances.

To further improve calibration and robustness in the text classification domain, and by extension uncertainty in NLP, we need to better understand what will make existing or novel uncertainty estimation techniques successful.
This requires the development of well-motivated tooling and protocols to reliably assess the quality and fidelity of posterior approximation. Generally, the role of priors in increasingly larger models deserves more attention. While our work focused on posterior geometry and weight-based priors in the form of regularization, stronger, more meaningful functional priors exist, which should be exploited to encourage desirable predictive behavior such as robustness to specific distribution shifts. Particularly for NLP, more focused research is required into what aspects (language data characteristics, inherent task difficulty or ambiguity, architecture design, learned representations, objectives, and effective parameter usage) render NLP pipelines more complex to imbue with reliable uncertainty and guarantee future out-of-distribution robustness.

Part II

Realistic and Efficient Document Understanding

Chapter 4

Beyond Document Page Classification: Design, Datasets, and Challenges

The contents of this chapter come from a publication [470] that was presented as an oral presentation at WACV 2024 (53/2042 ≈ 2.5%):

Jordy Van Landeghem, Sanket Biswas, Matthew Blaschko, and Marie-Francine Moens. Beyond Document Page Classification: Design, Datasets, and Challenges. In Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pages 2962–2972, 2024

Disclosing the work done: I conceptualized the work, implemented the experiments, and wrote the manuscript. Sanket Biswas helped with related work and polishing the writing, and we acknowledge help in data collection from Ruben Perez Tito and Stefan Larson.

This chapter focuses on moving beyond the (self-imposed) restrictions of page limits, and on exploring the full potential of DL for document processing. A major highlight is the need to bring document classification benchmarking closer to real-world applications, both in the nature of the data tested (X: multi-channel, multipage, multi-industry; Y: class distributions and label set variety) and in the classification tasks considered (f: multipage document, page stream, and document bundle classification, ...).
We start by introducing the problem of document classification (DC) and its importance in the larger scope of document understanding, for which we emphasize visually-rich documents, adopting the acronym VDU instead. Moreover, we identify the lack of public multipage document classification datasets, formalize the different classification tasks arising in application scenarios, and motivate the value of targeting efficient multipage document representations.

An experimental study on the proposed multipage document classification datasets demonstrates that current benchmarks have become irrelevant and need to be updated to evaluate complete documents, as they naturally occur in practice. This reality check also calls for more mature evaluation methodologies, covering calibration evaluation, inference complexity (time-memory), and a range of realistic distribution shifts (e.g., born-digital vs. scanning noise, shifting page order). This chapter ends on a hopeful note by recommending concrete avenues for future improvements, pertaining to document dataset construction efforts and suggested methodologies.

The work in this chapter was the trigger for the next chapter (Chapter 5), in which we propose a new, comprehensive DU benchmark, DUDE, that is more aligned with real-world applications and practices, naturally including multipage documents, and that satisfies many of this chapter's recommendations.

Figure 4.1. Overview of different classification tasks that can be found in real-world VDU applications and that are not sufficiently addressed in DC research. The classification task notation and definitions are introduced in Section 4.2.

4.1 Introduction

Visual Document Understanding (VDU) comprises a large set of skills, including the ability to holistically process both textual and visual components structured according to rich semantic layouts. The majority of efforts are directed toward the application-oriented tasks of classification and key information extraction (KIE) in visually-rich documents (VRDs). Document classification (DC) is a fundamental step in any industrial VDU pipeline, as it assigns a semantically meaningful category, routes a document for further processing (towards KIE, fraud checking), or flags incomplete (e.g., missing scans) or irrelevant documents (e.g., a recipe cookbook in a loan application).

Documents are intrinsically multipaged, explaining (partly) why PDF is one of the most popular universal document file formats.¹ While DC in information management workflows typically involves multipage VRDs, current public datasets [165, 233] only support single-page images and constitute too simplified benchmarks for evaluating fundamental progress in DC.

With the advent of deep learning, the VDU field has shifted from region-based analysis to whole-page image analysis. This shift led to substantial improvements in processing document images with more complex layout variability, exposing the limitations of template-based methods.
Our work highlights the opportunity and necessity of moving beyond page limits toward evaluation on complete document inputs, as they prevalently occur (multipage documents, bundles, page streams, and splits) across various practical scenarios within real-world DC applications, as demonstrated in Figure 4.1.

The practical task of long document classification [372] is largely underexplored due to computational challenges and the difficulty of efficiently representing large multimodal inputs. Additionally, the proximity to applications involves a larger community for conducting research, yet innovations may happen in isolation or be kept back as intellectual property, lacking evaluation on public benchmarks [147, 148], consequently hindering reproducibility and fair comparisons.

Existing DC methodology is limited to single-page images and independently and identically distributed (i.i.d.) settings. We propose an improved methodology that extends its scope to multipage images and non-i.i.d. settings. We also reflect on evaluation practices and put forward more mature evaluation protocols. To better capture the complexity of real-world document handling, we align DC benchmarking closer to practical applications and task formulations.

¹ PDF is the 2nd most popular file format on the web (after HTML and XHTML), following detected MIME types in CommonCrawl.

Our key contributions can be summarized as follows:
• We have redesigned and formalized multipage DC scenarios to align fragmented definitions and practices.
• We construct and share two novel datasets, RVL-CDIP_MP (huggingface.co/datasets/bdpc/rvl_cdip_mp) and RVL-CDIP-N_MP (huggingface.co/datasets/bdpc/rvl_cdip_n_mp), with the community for evaluating multipage DC.
• We conduct a comprehensive analysis of the novel datasets with different experimental strategies, observing promise from a best-case analysis (+6% absolute accuracy) of targeting multipage document representations and inference.
• We overview challenges stalling DC progress, giving concrete guidelines to improve and increase dataset construction efforts.

4.2 Problem Formulation

We propose formal definitions to better align DC research with real-world document distributions and practices. This will help standardize DC practices and make it easier to compare different methods.

Let X denote a space of documents, and let Y denote the output space as a finite set of discrete labels. Document page classification is a prototypical instance of classification [472], where the goal is to learn an estimator f: X → Y using N supervised input-output pairs (X, Y) ∈ X × Y drawn i.i.d. from an unknown joint distribution P(X, Y).

A page p is a natural classification input consisting of an image v ∈ R^(Q×H×W) (number of channels, height, and width, respectively) with T word tokens {t_i}_{i=1}^T organized according to a layout structure {(x_i^1, y_i^1, x_i^2, y_i^2)}_{i=1}^T, typically referred to as bounding boxes, either coming from Optical Character Recognition (OCR) or natively encoded.

Note that in practical business settings, VRDs are presented at inference time to a production VDU system in different forms:

I. Single page (often scanned or photographed)
II. Single document
III. Multiple documents
IV. Multiple pages (often bulk-scanned to a single PDF)
V. Single image with multiple localized pages
Classification tasks. In a unification attempt, we formalize the different classification inputs and tasks that arise in practical scenarios, as visualized in Figure 4.1.

Definition 9 [Page Classification]. (I) A page (as defined above) is categorized with a single category. When only considering the visual modality, the literature refers to this as 'document image classification' [165]. An estimator for page classification, with the input dimensionality (X_p) relative to a page (viz., number of channels, height, and width), is defined as:

f_p: X_p → Y, where Y = [C] for C mutually exclusive categories. (4.1)

Definition 10 [Document Classification]. (II) A document d contains a fixed number of L ∈ [1, ∞) pages, which do not necessarily have the same dimensions (height and width). Albeit a design choice, the input dimensionality is normalized across pages (e.g., 3 × 224 × 224). Assuming a fixed input dimensionality (X_d) relative to a document (L × Q × H × W), a document classifier is defined as:

f_d: X_d → Y, where Y = [K] for K mutually exclusive categories. (4.2)

Note also the difference in label space between the two previous classification tasks, which can have some overlap for document types that are uniquely identifiable from a single page (e.g., an accident statement form).

Definition 11 [Document Bundle Classification]. (III) A bundle b can contain a variable number of B documents, each with a potentially different number of L pages. A bundle classifier models a sequence classification problem over multiple documents:

f_b: X_b → Y, where Y is a product space of B documents, Y = Y_1 × ... × Y_B, with {Y_j = [K] : j ∈ [B]}. (4.3)

Definition 12 [Document Stream Classification]. (IV) A page stream s is similar to a document in terms of input (number of pages L), albeit typically more varied in content and page formats. Page streams can implicitly contain many different documents, with pages not necessarily contiguous or even in the right order, as illustrated in Figure 4.1.

f_s: X_d → Y, where Y is a product space of L pages, Y = Y_1 × ... × Y_L, with {Y_j = [C] : j ∈ [L]}. (4.4)

A very concrete example of how the label sets [C] and [K] can differ is a loan application use case where national registry proofs need to be sent: if two pages are sent with the front and back of the ID card, f_s requires two labels (id_front, id_back), whereas f_d requires a single document label (id_card).
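To make these signatures concrete, the following is a minimal typing sketch in Python; the type aliases are hypothetical names for illustration only and are not part of the benchmark code.

```python
from typing import Any, Callable, List, Sequence

Page = Any          # stand-in for a (Q, H, W) page image with its OCR tokens
PageLabel = str     # labels from [C], e.g. "id_front"
DocLabel = str      # labels from [K], e.g. "id_card"

FP = Callable[[Page], PageLabel]                            # f_p, Definition 9
FD = Callable[[Sequence[Page]], DocLabel]                   # f_d, Definition 10
FB = Callable[[Sequence[Sequence[Page]]], List[DocLabel]]   # f_b, Definition 11
FS = Callable[[Sequence[Page]], List[PageLabel]]            # f_s, Definition 12
```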
A critical note is due to differentiate page stream segmentation (PSS) [128, 328, 494] from page stream classification as defined above (f_s). PSS treats a page stream as a binary classification task to identify document boundaries, without classifying the identified documents afterward. f_s considers the task in one stage, where [C] is constructed either to send atomic units, such as the wage slip in Figure 4.1, for individual downstream processing, or to be combined into a single document label from [K] based on the assigned page labels. Two-stage processing is possible by applying PSS as an instance of an f_s classifier with [C] = {0, 1}, where 1 indicates a document boundary, followed by f_d.

Definition 13 [Page Splitting]. (V) A multipage image m contains multiple page objects of similar types, which can have multiple orientations, page dimensions, and often physical overlap from poor scanning [132]. A standard example involves multiple receipts to be analyzed for reclaiming VAT. While a complete approach would consist of localizing pages (using edge/corner detection, object detection, or instance segmentation) and identifying page types, we only focus on the latter. For instance, multipage splitting can be defined as a preliminary check on how many page types are present in a multipage image (with input dimensionality similar to a single page p):

f_m: X_p → Y, where Y = Z^C. (4.5)

Payment proofs such as tickets and receipts are more often packed together due to their compactly printed sizes, which requires splitting the unique documents from within a page to send them individually for further processing. Following the national registry example, another rare yet "economical" variation for f_d occurs when a single page contains both the front and back of the ID card stitched together. These edge cases (rightmost example in Figure 4.1) should be dealt with on a case-by-case basis when setting up [K] (e.g., a specific label: multi-tickets).

The formalisms defined above establish a taxonomy of DC tasks, which will be revisited in the discussion of challenges to align DC research and applications (Section 4.5).
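As an illustration of the two-stage option mentioned above (PSS followed by f_d), consider this minimal sketch; the function names are hypothetical and both classifiers are assumed to be given.

```python
from typing import Any, Callable, List, Sequence

Page = Any  # stand-in for a page image

def two_stage_stream_classification(
    stream: Sequence[Page],
    pss: Callable[[Page], int],            # 1 = page starts a new document
    f_d: Callable[[Sequence[Page]], str],  # document classifier over [K]
) -> List[str]:
    """Split a page stream at predicted boundaries, then label each segment."""
    segments: List[List[Page]] = []
    for page in stream:
        if pss(page) == 1 or not segments:  # boundary (or the very first page)
            segments.append([])
        segments[-1].append(page)
        # note: an imperfect PSS propagates its errors to the f_d stage
    return [f_d(segment) for segment in segments]
```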
4.3 Balancing Research & Applications

Having established a taxonomy, we further sketch the role of DC in the larger scope of VDU, in both the applications and research contexts. We point to related VDU benchmarks and describe current DC datasets with their relevant (or missing) properties using the task formalizations. Next, we link to related initiatives in dataset construction and calls for reflection on DU practices. Finally, we introduce the curated DC datasets to support multipage DC (f_d) benchmarking, which will be used in a further experimental study.

General Benchmarking in VDU: In any industrial application context where information transfer and inbound communication services are an important part of the day-to-day processes, a vast number of documents have to be processed. To provide customers with the expected service levels (in terms of speed, convenience, and correctness), a lot of time and resources are spent on categorizing these documents and extracting crucial information. Complex business use cases (such as consumer lending, insurance claims, real estate purchases, and expenditure) involve processing bundles of different documents that clients send via any communication channel. For example, obtaining a loan typically entails sending the following documents to prove solvency: a number of monthly pay stubs, bank statements, tax forms, and national registry proofs. Furthermore, not all documents are born-digital (BD), and as an artifact of the communication channel (bulk scans/photographs, digitization of physical mail), a single client communication can contain an arbitrary number of document page images in an unknown order, requiring an f_s classifier. Figure 4.1 provides an overview of the different DC tasks that arise in application scenarios, which are scarcely covered by DC research benchmarks (see Table 4.2). As RVL-CDIP is the only large-scale non-synthetic DC benchmark, we discuss it in more detail; other dataset descriptions can be found in the Supplementary material.

Current state-of-the-art DU research approaches [15, 187, 259] leverage the "pretrain and fine-tune" procedure, which performs significantly well on popular DU benchmarks [165, 188, 197, 544] (see Table 4.1). However, their performance drops significantly when exposed to real-world business use cases, mainly for the following reasons: (1) The models are limited to modeling page-level context due to heavy compute requirements (e.g., the quadratic complexity of self-attention [473]), effectively treating each document page as conditionally independent and potentially missing out on essential classification cues. (2) The methods are heavily reliant on the quality of OCR engines to extract spatially local information (i.e., mostly at word level) suitable for solving downstream benchmark tasks, but fail to generalize well on business documents. (3) Existing datasets used for pretraining [165, 252] are different in terms of domain, content, and visual appearance from many downstream DC tasks (detailed in Section 4.5.3). Therefore, it can be challenging for industry practitioners to choose a specific model to fine-tune for the DC use cases and task specifics that they commonly encounter.

Dataset | Size | Data Source | Domain | Task | OCR | Layout
IIT-CDIP [252] | 35.5M | UCSF-IDL | Industry | Pretrain | ✗ | ✗
RVL-CDIP [165] | 400K | UCSF-IDL | Industry | DC | ✗ | ✗
RVL-CDIP-N [241] | 1K | Document Cloud | Industry | DC | ✗ | ✗
TAB [328] | 44.8K | UCSF-IDL | Industry | DC | ✗ | ✗
FUNSD [197] | 199 | UCSF-IDL | Industry | KIE | ✓ | ✗
SP-DocVQA [308] | 12K | UCSF-IDL | Industry | QA | ✓ | ✗
OCR-IDL [40] | 26M | UCSF-IDL | Industry | Pretrain | ✓ | ✗
FinTabNet [543] | 89.7K | Annual Reports S&P | Finance | TSR | ✗ | ✓
Kleister-NDA [432] | 3.2K | EDGAR | US NDAs | KIE | ✓ | ✗
Kleister-Charity [432] | 61.6K | UK Charity Commission | Legal | KIE | ✓ | ✗
DeepForm [435] | 20K | FCC Inspection | Forms broadcast | KIE | ✓ | ✗
TAT-QA [550] | 2.8K | Open WorldBank | Finance | QA | ✓ | ✗
PubLayNet [544] | 360K | PubMed Central | Scientific | DLA | ✗ | ✓
DocBank [261] | 500K | arXiv | Scientific | DLA | ✓ | ✓
PubTabNet [545] | 568K | PubMed Central | Scientific | TSR | ✗ | ✓
DUDE [468] | 40K | Mixed | Multi-domain | QA | ✓ | ✗
Docile [422] | 106K | EDGAR & synthetic | Industry | KIE | ✓ | ✗
CC-PDF [460] | 1.1M | Common-Crawl (2010-22) | Multi-domain | Pretrain | ✗ | ✗

Table 4.1. DU benchmarks with their significant data sources and properties.
Acronyms for tasks: DC: Document Classification; DLA: Document Layout Analysis; KIE: Key Information Extraction; QA: Question Answering; TSR: Table Structure Recognition.

Dataset | Purpose | #d | #p | |Y| | Language | Color depth
NIST [98] | f_s | - | 5590 | 20 | English | Grayscale
MARG [290] | f_s | - | 1553 | 2 | English | RGB
Tobacco-800 [553] | f_s | - | 800 | 2 | English | Grayscale
TAB [328] | f_s | - | 44.8K | 2 | English | Grayscale
Tobacco-3482 [232] | f_p | - | 3482 | 10 | English | Grayscale
RVL-CDIP [165] | pretraining, f_p | - | 400K | 16 | English | Grayscale
RVL-CDIP-N [241] | f_p, OOD | - | 1002 | 16 | English | RGB
RVL-CDIP-O [241] | f_p, OOD | - | 3415 | 1 | English/Mixed | RGB
RVL-CDIP_MP | f_d | ±400K | E[L] = 5 | 16 | English | Grayscale
RVL-CDIP-N_MP | f_d, OOD | 1002 | E[L] = 10 | 16 | English | RGB

Table 4.2. Statistical comparison of public and proposed extended multipage DC datasets. OOD refers to out-of-distribution detection. #d and #p refer to the number of documents or pages, respectively. For the novel MP datasets, we report the average number of pages E[L].

RVL-CDIP. The Ryerson Vision Lab Complex Document Information Processing [165] dataset used the original IIT-CDIP (The Illinois Institute of Technology dataset for Complex Document Information Processing) [252] metadata to create a new dataset for document classification. It was created as the equivalent of ImageNet in the VDU field, which invited a lot of multi-community (Computer Vision, NLP) efforts to solve this dataset. It consists of low-resolution, scanned documents belonging to one of 16 classes such as letter, form, email, invoice.

Proposed Datasets. RVL-CDIP_MP is our first contribution, retrieving the original documents of the IIT-CDIP test collection which were used to create RVL-CDIP. Some PDFs or encoded images were corrupt, which explains why we have around 500 fewer instances. By leveraging metadata from OCR-IDL [40], we matched the original identifiers from IIT-CDIP and retrieved the documents from IDL using a conversion. However, the same caveats as for RVL-CDIP apply. RVL-CDIP-N_MP can serve its original goal as a covariate shift test set, now for multipage document classification. We were able to retrieve the original full documents from DocumentCloud and Web Search. As no existing large-scale datasets include granular page-level labeling (in terms of [C]) for multipage documents, we could not create a benchmark for evaluating f_s. Appendix B points to visualizations from the proposed datasets.
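Both datasets are hosted on the Hugging Face Hub (see the contribution list in Section 4.2). A minimal loading sketch could look as follows; the split and feature names are assumptions, and the actual loader may differ.

```python
from datasets import load_dataset

# Hedged sketch: load the proposed multipage test sets from the Hub.
mp = load_dataset("bdpc/rvl_cdip_mp", split="test")
mp_n = load_dataset("bdpc/rvl_cdip_n_mp", split="test")
print(mp[0].keys())  # expected: page images plus a document-level label
```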
Related Initiatives. General benchmarking challenges have driven the VDU research community to set the seed for initiatives to create its own document-oriented "ImageNet" [399] challenge, over which multiple long-term grand challenges can be defined (deepdoc2022, scaldoc2023). In another task paradigm, DocVQA, there have been efforts in the same spirit to redirect focus to multipage documents [451, 467]. For the task of KIE, [424] launched a similar call for practical document benchmarks closer to real-world applications. While these initiatives point in a similar future direction, our contribution goes beyond introducing novel datasets and seeks to guide the complete methodology of DC benchmarking.

4.4 Experimental Study

To classify a multipage document, one might ask: "Why not just predict based on the first page? What would be the gain of processing all pages? What baseline inference strategies can be applied to classify a multipage document?". This prompted us to put these assumptions to the test in a small motivating study (code provided at https://huggingface.co/bdpc/src).

As current public datasets only support page classification, we have extended some existing DC datasets to enable testing a slightly more realistic, yet more complex, document classification scenario (f_d). We have reconstructed the original PDF data of the DC datasets in Section 4.3. The goal of this experiment is to tease out some issues and strategies when naively scaling beyond page-level DC. Our baseline of choice is the document foundation model DiT-Base [259], which as a visual-only f_p is competitive with more compute-intensive multimodal, OCR-based pipelines [15, 187, 443].

Inference | Strategy | Scope
sample | first | page
sample | second | page
sample | last | page
sequence | max confidence | page
sequence | soft voting | page
sequence | hard voting | page
grid | grid | document
document | (not tested) | document

Table 4.3. Tested inference methods to classify multipage documents and simulate a true document classifier f_d. Scope refers to the independence assumption taken at inference time.

Table 4.3 overviews some straightforward inference strategies. The simplest inference strategy is to sample a given page with index l ∈ [L] (or in our case {1, 2, L − 1}) and predict ŷ^l = [f_p(x)]^l. The sequence strategies mainly differ in how the final prediction ŷ is obtained from the predictions per page, assuming a probabilistic classifier f̃_p: X_p → [0, 1]^K:

MaxConf(x, y) = argmax_{l ∈ [L], k ∈ [K]} [f̃_p(x, y)]^l_k (4.6)

SoftVote(x, y) = argmax_{k ∈ [K]} Σ_{l=1}^{L} [f̃_p(x, y)]^l_k (4.7)

HardVote(x, y) = argmax_{k ∈ [K]} Σ_{l=1}^{L} (e_{ŷ^l})_k, (4.8)

with e a one-hot vector of size K.
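A minimal NumPy sketch of these three aggregation rules, assuming a precomputed (L × K) matrix of per-page class probabilities, is given below; it illustrates Eqs. (4.6)-(4.8) and is not the benchmark implementation.

```python
import numpy as np

def aggregate_page_predictions(probs: np.ndarray, strategy: str) -> int:
    """Combine per-page probabilities `probs` of shape (L, K) into one label."""
    if strategy == "max_confidence":      # Eq. (4.6): most confident page wins
        _, k = np.unravel_index(np.argmax(probs), probs.shape)
        return int(k)
    if strategy == "soft_vote":           # Eq. (4.7): sum probabilities over pages
        return int(probs.sum(axis=0).argmax())
    if strategy == "hard_vote":           # Eq. (4.8): count per-page argmax votes
        votes = np.bincount(probs.argmax(axis=1), minlength=probs.shape[1])
        return int(votes.argmax())
    raise ValueError(f"unknown strategy: {strategy}")
```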
The grid strategy is intuitive: we tile all page images in an equal-sized grid, trading off resolution to jointly consume all document pages. While the results for this strategy in our experiment, with a fairly low grid resolution (224 × 224), are poor, variations (with aspect-preserving [247] or layout-density-based scaling) deserve further exploration.

Following similar calls in the VDU literature [468] to establish calibration and confidence ranking as default evaluation metrics, we include Expected Calibration Error (ECE) [156, 332, 340] to evaluate top-1 prediction miscalibration and the Area-Under-Risk-Coverage-Curve (AURC) [138, 193] to measure selective accuracy over test set coverage (cf. Section 2.2.3).

Strategy | Acc↑ | F1↑ | F1_M↑ | ECE↓ | AURC↓
f_p$ [259] | 93.345 | 93.351 | 93.335 | 0.075 | 0.010
first | 91.291 | 91.286 | 91.271 | 0.073 | 0.014
second | 87.295 | 87.305 | 87.277 | 0.070 | 0.029
last | 85.091 | 85.060 | 85.028 | 0.072 | 0.038
MaxConf | 91.407 | 91.453 | 91.344 | 0.124 | 0.006
SoftVote | 91.220 | 91.185 | 91.236 | 0.134 | 0.004
HardVote | 85.995 | 86.182 | 85.781 | 0.085 | 0.018
grid | 72.642 | 72.045 | 73.266 | 0.109 | 0.042

Table 4.4. Base classification performance of DiT-base [259] (finetuned on RVL-CDIP), evaluated on the test set of RVL-CDIP_MP per baseline f_d strategy. Best results per metric are boldfaced. $ refers to our reproduction of results.

Strategy | Acc↑ | F1↑ | F1_M↑ | ECE↓ | AURC↓
f_p [241] | 78.643 | 81.947 | 60.564 | 0.105 | 0.076
first | 78.760 | 75.316 | 60.801 | 0.144 | 0.025
second | 64.939 | 58.741 | 50.773 | 0.132 | 0.071
last | 64.228 | 58.192 | 48.859 | 0.128 | 0.074
MaxConf | 76.321 | 72.855 | 57.470 | 0.180 | 0.042
SoftVote | 73.984 | 69.163 | 56.486 | 0.183 | 0.039
HardVote | 67.480 | 63.188 | 52.235 | 0.110 | 0.088
grid | 47.755 | 40.645 | 38.584 | 0.102 | 0.170

Table 4.5. Base classification performance of DiT-base [259] (finetuned on RVL-CDIP), evaluated on the test set of RVL-CDIP-N_MP per baseline f_d strategy. Best results per metric are boldfaced.

Results in Tables 4.4 and 4.5 demonstrate that classifying by only the first page is a solid strategy, with performance dropping when considering only later pages. Maximum confidence and soft voting require L (pages) times more processing, yet attain performance similar to the best single-page prediction. However, this could be attributed to two factors: i) dataset creation bias, since [165] constructed RVL-CDIP from one page of each original .tiff file, for which the label was kept if it belonged to one of the 16 categories, whereas RVL-CDIP-N [241] consistently chose the first page; ii) documents are fashioned in a summary-detail or top-down content structure over pages. To confirm the validity of the latter hypothesis, more robust experiments on more fine-grained labeled DC data are needed.

The results in Table 4.4 and Table 4.5 can be interpreted as an upper bound (i.i.d.) and a loose lower bound (non-i.i.d., yet related), respectively. For the former, MaxConf is the most accurate, yet compared to SoftVote has worse AURC, potentially making SoftVote a better candidate for industry use where controlled risk is valued more. While this trend is not reproduced on RVL-CDIP-N_MP, this can be explained by its more consistent first-page labeling, with later pages adding distracting classification cues.
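For concreteness, minimal sketches of both evaluation metrics are given below; equal-width binning for ECE is one common choice among several, and this is not the exact benchmark implementation.

```python
import numpy as np

def ece(conf: np.ndarray, correct: np.ndarray, n_bins: int = 10) -> float:
    """Expected Calibration Error over equal-width confidence bins."""
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    total, n = 0.0, len(conf)
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (conf > lo) & (conf <= hi)  # top-1 confidences are > 1/K > 0
        if mask.any():
            gap = abs(conf[mask].mean() - correct[mask].mean())
            total += mask.sum() / n * gap
    return float(total)

def aurc(conf: np.ndarray, correct: np.ndarray) -> float:
    """Area under the risk-coverage curve: average selective error rate
    when predictions are admitted in order of decreasing confidence."""
    order = np.argsort(-conf)
    err = 1.0 - correct[order].astype(float)
    risk = np.cumsum(err) / np.arange(1, len(err) + 1)
    return float(risk.mean())
```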
Dataset | Strategy (∗) | Acc↑ | ∆
RVL-CDIP_MP | first+second | 93.795 | 2.504
RVL-CDIP_MP | first+last | 93.675 | 2.384
RVL-CDIP_MP | second+last | 89.709 | −1.583
RVL-CDIP_MP | first+second/last | 94.454 | 3.163
RVL-CDIP-N_MP | first+second | 83.638 | 4.878
RVL-CDIP-N_MP | first+last | 83.130 | 4.370
RVL-CDIP-N_MP | second+last | 71.545 | −7.215
RVL-CDIP-N_MP | first+second/last | 84.553 | 5.793

Table 4.6. Best-case classification accuracy (indicated with (∗)) when combining 'knowledge' over different pages. ∆ refers to the absolute difference with the first-page-only strategy.

To answer what can be gained from processing a multipage document in a single shot, Table 4.6 reports a best-case error analysis, where a page prediction is counted as correct if the model would have had access to the other pages. This is calculated using a bit-wise OR operation between the one-hot vectors (I[y == ŷ]) expressing correctness for each strategy model. As a proof of concept, this shows that targeting multipage document representations and inference is a promising avenue to improve DC.
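The best-case combination above amounts to a one-line operation over per-document correctness indicators; a minimal sketch with hypothetical variable names:

```python
import numpy as np

def best_case_accuracy(correct_a: np.ndarray, correct_b: np.ndarray) -> float:
    """Best-case accuracy of two page-level strategies: a document counts
    as correct if either strategy classified it correctly (bit-wise OR
    over the 0/1 indicators I[y == y_hat])."""
    return float(np.mean(correct_a | correct_b))

# e.g. combining the 'first' and 'second' page strategies:
# acc = best_case_accuracy(first_correct, second_correct)
```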
4.5 Challenges and Guidelines

Following the introduced task formalizations of Section 4.2, we claim that the distribution on which document classification is currently evaluated publicly and the real-world distribution have heavily diverged. Additionally, our experimental validation on the novel datasets demonstrated the potential of multipage DC, empirically reinforcing our call to action on improving DC methodologies. Let P^A(X, Y) and P^R(X, Y) denote those two distinct distributions, of real-world applications and research, respectively. In the following, we characterize the specific divergences with concrete examples and suggestions for better alignment.

4.5.1 Divergence of Tasks: f

The challenge of directly processing multipage documents is typically avoided by current DC models, which only support single-page images [15, 153, 187, 216, 247, 263, 371, 443]. Whenever a new DU model innovation happens, its impact on document classification is publicly measured only on the first task scenario (e.g., f_p on RVL-CDIP), whereas production DU systems more often need to deal with the other settings (II, III, IV, V) in Figure 4.1. Moving beyond the limited page-image context will test models' ability to sieve through potentially redundant and noisy signals, as the classification can depend on very local cues, such as a single title on the first page or the presence of signatures on the last page. Without any datasets to test this ability, we also cannot blindly assume that we can simply scale f_p classifiers to take in more context, or that aggregating isolated predictions over single pages is a future-proof (performant and efficient) strategy, as our experiments have shown.

While p is a natural processing unit for humans, acquiring supervised annotations for every single page can be more expensive than attaching a single content-based label (from [K]) to a multipage document. However, fine-grained labeling with f_s could allow for more targeted and constrained KIE, as knowing that a certain page l has label y^l = id_front ∈ [C] allows focusing on specific entities such as the national registry number and date/place of birth. Ultimately, these classification task formulations can also help one consider how to set up f directly and how to annotate document inputs, depending on the DC use case.

4.5.2 Divergence of Label Space: Y

Current benchmarks often use simplified label sets that are difficult to reconcile with industry requirements. While RVL-CDIP is the de facto standard for measuring performance on f_p DC, recent research [242] has revealed several undesirable characteristics. It supports only 16 labels that pertain to a limited yet generic subset of business documents, which is far from the 1K classes of ImageNet, in whose image it was modeled. Real-world DC use cases typically support a richer number of classes (K ∼ 50-400). RVL-CDIP suffers from substantial label noise, estimated to be higher than current state-of-the-art f_p error rates (see [242] for a detailed analysis), which are thus overfit to noise. Due to the absence of the original labeling guidelines, the labels in RVL-CDIP can be ambiguous, containing disparate subtypes (e.g., business cards in the resume category) and inconsistencies between classes (cheques present in both the budget and invoice categories). Other errors include (near-)duplicates causing substantial overlap between train and test distributions, corrupt documents, and plain mislabeling. However, many common CV benchmarks are plagued by similar issues [31] and would benefit from relabeling campaigns [519] to maintain their relevance.

Considering the above, multi-label classification (not covered explicitly in Section 4.5.1) could be a solution to resolve label ambiguities, yet this requires absolute consistency in label assignments, which, when lacking, introduces even more label noise. The highest labeling quality could arise from consistent labeling at the page level and hierarchically aggregating page labels (C → K), yet granular annotations are more expensive to obtain (see the sketch after this subsection). Alternatively, it may be better to follow the mutually exclusive and collectively exhaustive (MECE) principle [72] to construct label sets at the document level.

Finally, an overlooked aspect of current benchmarks is that label sets [K] can be constructed based on some business logic, where a very local cue can lead to a class assignment, such as a checked box on page 26. Admittedly, this conflates the tasks of document object detection, KIE, and DC within a single label set. However, the current focus on classes with plenty of evidence across a document, with more global classification cues, should be balanced with document types that rely on local cues.

Taking the above issues into account, the community should work together towards developing more effective and realistic DC datasets that better align with the needs of industry practitioners. While tackling the challenge of Y divergence was out of scope for the contributed datasets, the next subsection gives systematic recommendations for obtaining better future DC benchmarks.
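As referenced above, hierarchical aggregation of page labels [C] into a document label [K] could be as simple as a lookup plus a consistency check; a minimal sketch with a hypothetical mapping:

```python
# Hypothetical page-to-document label mapping (C -> K), for illustration.
PAGE_TO_DOC = {
    "id_front": "id_card",
    "id_back": "id_card",
    "wage_slip": "payslip",
}

def aggregate(page_labels: list[str]) -> str:
    """Aggregate per-page labels into one document label from [K]."""
    doc_labels = {PAGE_TO_DOC[label] for label in page_labels}
    if len(doc_labels) != 1:
        # inconsistent page labels would introduce label noise if forced
        raise ValueError(f"ambiguous document label: {sorted(doc_labels)}")
    return doc_labels.pop()
```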
Figure 4.2. Divergence of input data. The first image is an example from the DC benchmark RVL-CDIP [165], the second one from Docile [422] for KIE, while the third one comes from Info-VQA [310], illustrating the visual-layout richness of modern VRDs versus the monotonicity of most DC document data.

4.5.3 Divergence of Input Data: X

We offer suggestions for future benchmark construction efforts such that they take into account properties that are currently unaccounted for, organically improving on our first pursuit towards multipage DC benchmarking.

We argue that current VDU benchmarks fail to account for many real-world document data complexities: multiple pages, the distinction between born-digital and (mobile-)scanned documents, and differences in quality, orientation, and resolution. Additionally, the UCSF Industry Documents Library (and in consequence all DC datasets drawn from this source) contains mostly old (estimated period 1950s to 2002), type-written, black-and-white documents, while in reality, modern documents can have multiple channels, colors, and (embedded) fonts varying in size, typeface, and typography. Recently, there have been efforts to collect more modern VRD benchmarks for tasks such as DocVQA [310, 468], KIE [422], and DLA [362]. Modern VRDs contain visual artifacts such as logos, checkboxes, barcodes, and QR codes, and geometric elements such as rectangles, arrows, charts, and diagrams, all of which are not frequently encountered with the same variety in current benchmarks. Future DC benchmarks should incorporate modern VRDs to bring more diversity and variability into the input data.

When developing DU models, it is therefore important to consider the role of vision, language, and layout, and how these are connected to the classification task. For example, current datasets are based on tobacco industry documents containing very domain-specific language, to which a less robust classifier can overfit (e.g., the spurious cue that a particular cigarette brand indicates an invoice). We highlight that document data can be multi-lingual, and code-switching is fairly common in document-based communications. For instance, an email may be in one language while the attachment is in another.
In summary, future benchmarks must contain multipage, multi-type, multi-industry (e.g., retail vs. medical invoice), multi-lingual documents with a wide range of document data complexities to build and test generic DC systems. The community should explore potential solutions to the lack of adequate datasets for testing DC models, such as i) leveraging public document collections, ii) synthetic generation, and iii) anonymization.

Public document collections: There are increasingly more (non-profit) organizations (e.g., DocumentCloud), governments (SEC EDGAR), financial institutions (World Bank Documents & Reports), and charities (Guidestar) that make business-related documents publicly available for transparency in their operations and for archival/research purposes. These collections provide datasets that are closer to real-world scenarios. However, these documents are typically unlabelled, although annotations could be crowd-sourced through combined funding from interested parties. Since most document data sources restrict automated crawling or document scraping, future dataset construction will require some cooperation and creativity, whilst fulfilling licensing, ethical, and legal requirements. A specifically highlighted initiative is CC-PDF [460], which collected modern, multi-lingual VRDs from CommonCrawl for future use.

Data synthesis: This alternative was suggested by prior work on KIE [30, 424] and DLA [37] for generating business and scientific documents. [422] followed up on this, delivering a large-scale KIE dataset with 6K annotated real documents and 100K synthetic examples. However, it can be challenging for synthetic generation to simulate real-world documents with similar data and classification complexity.

Anonymization can be a viable option to construct a DC dataset without compromising ethical guidelines and privacy regulations. This process involves removing, masking, replacing, or obfuscating data so that document content can no longer be attributed to an individual or entity. For example, one should remove names, addresses, and identifying information such as social security numbers, or replace them with a textual tag ([social-security-number]) or a similar surrogate pattern (e.g., generated with Faker), as sketched below. While this process is not viable for creating KIE datasets, KIE can play a big role in semi-automatically anonymizing documents [143, 366]. Companies may be hesitant to make document collections public due to concerns about privacy, confidentiality, and GDPR compliance. While anonymization can be an effective method, it should be approached with caution, as potential risks of re-identification can make someone with originally good intentions legally liable. A potential side-step is investing in privacy-preserving federated learning (e.g., PFL-DocVQA) to allow access to private industry document data.
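A minimal sketch of the tag-or-surrogate replacement idea, using the Faker library; the regex is a toy pattern for US social security numbers, and real pipelines would locate entities with KIE models instead.

```python
import re
from faker import Faker

fake = Faker()
SSN = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")  # toy PII detector

def anonymize(text: str, surrogate: bool = False) -> str:
    """Replace detected SSNs with a tag or a fake but plausible value."""
    if surrogate:
        return SSN.sub(lambda m: fake.ssn(), text)
    return SSN.sub("[social-security-number]", text)
```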
4.5.4 Maturity of Evaluation Methodology

Most DC models are evaluated using predictive performance metrics such as accuracy, precision-recall, and F1-score on i.i.d. test sets. However, in user-facing applications, calibration can be as important as accuracy [156, 332, 340], even more so when the confidence estimate of a DC model is used to triage predictions to either an automated flow or manual processing by a human. Once a DC model is in production, the i.i.d. assumption will start to break, which recommends a priori testing of robustness against various sources of noise (OCR, subtle template changes, wording or language variations, ...) and expected distribution shifts (born-digital vs. scanning artifacts, shifting page order, page copies, irrelevant or out-of-scope documents, novel document classes, concept drift, ...).

Nevertheless, we observe only a few applications in DC (only reported on f_p) of more mature evaluation protocols [193] beyond predictive performance. Notable exceptions include covariate shift detection from document image augmentations [304], sub-class shift and generalization [241, RVL-CDIP-N], out-of-distribution detection [241, RVL-CDIP-O], and cross-domain generalization [23, (RVL-CDIP ↔ Tobacco-3482)]. However, the results on the latter can be misleading, as both datasets are drawn from a similar source distribution. Another gap in DC benchmarking concerns evaluating selective classification [138, 193], which is closer to the production-value question of how many documents can be automated without any human assistance.

Another interesting evaluation protocol concerns out-of-the-box performance, or how data-hungry/sample-efficient a certain model is. In practice, few-shot learning from minimal annotations is a highly valued skill. This few-shot evaluation protocol has been applied in [402] with different data regimes. Finally, inference complexity (time-memory) has been brought back to attention by OCR-free models [216], and we believe it will be key to measure when scaling solutions to multipage documents.

4.6 Chapter Conclusion

Our work represents a pivotal step forward in establishing multipage DC by proposing a comprehensive benchmarking and evaluation methodology. Thereby, we have addressed longstanding challenges and limitations (Section 4.5) that have hindered progress in the field. As motivated by our experimental study, we have demonstrated the need to advance multipage document representations and inference.

Following up on this, we provide recommendations for future DC dataset construction efforts, pertaining to the type and nature of document data and to the variety and quality of the classification label set, with a focus on particular DC scenarios closer to applications, and finally to how future progress should be measured. We are hopeful that the VDU community can come together on these shortcomings and apply the lessons from this reality check. Extending the applicability of current state-of-the-art VDU models to multipage documents needs further exploration, which will go hand in hand with benchmark creation initiatives or with incorporating multiple DC task annotation layers on a single dataset.
Chapter 5

Document UnderstanDing of Everything (DUDE)

The contents of this chapter come from two publications [468, 469]:

Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, Dawid Jurkiewicz, Rafał Powalski, Paweł Józiak, Sanket Biswas, Mickaël Coustaty, and Tomasz Stanisławek. ICDAR 2023 Competition on Document UnderstanDing of Everything (DUDE). In International Conference on Document Analysis and Recognition, pages 420–434. Springer, 2023

Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, Pawel Joziak, Rafal Powalski, Dawid Jurkiewicz, Mickaël Coustaty, Bertrand Anckaert, Ernest Valveny, Matthew Blaschko, Marie-Francine Moens, and Tomasz Stanisławek. Document Understanding Dataset and Evaluation (DUDE). In Proceedings of the IEEE/CVF International Conference on Computer Vision, pages 19528–19540, 2023

The first publication, on the Document UnderstanDing of Everything (DUDE) competition, was selected for oral presentation at ICDAR 2023. The second publication, on the DUDE dataset and benchmark, was featured as a poster presentation at ICCV 2023.

This multi-party collaboration (6 universities and 3 companies) with many brilliant researchers involved the creation of a new dataset and benchmark, the organization of a competition, and the publication of the results. For clarity, we will refer to the DUDE competition as the ICDAR 2023 competition, and to the DUDE dataset and benchmark as the ICCV publication.

Author declarations: https://drive.google.com/file/d/1AmSxTOLk1Lo61sgWLd5FN5OMNQEgam_v

In short, I conceptualized the project, was responsible for the dataset creation, annotation, and benchmarking (encoder-only models, T5, HiVT5), designed evaluation and confidence estimation, and wrote the majority of the ICDAR and ICCV papers.

The dataset is available at: https://huggingface.co/datasets/jordyvl/DUDE_loader. Benchmark code is available at: https://github.com/rubenpt91/MP-DocVQA-Framework. The competition remains open for submissions at: https://rrc.cvc.uab.es/?ch=23.

Document UnderstanDing of Everything (DUDE) is a concept rooted in both machine learning and philosophy, seeking to expand the boundaries of document AI systems by creating highly challenging datasets that encompass a diverse range of topics, disciplines, and complexities.
Inspired by the philosophical 'Theory of Everything', which aims to provide a comprehensive explanation of the nature of reality, DUDE endeavors to stimulate the development of AI models that can effectively comprehend, analyze, and respond to any question on any complex visually-rich document (VRD).

Incorporating philosophical perspectives into DUDE enriches the approach by engaging with fundamental questions about knowledge, understanding, and the nature of documents. By addressing these dimensions, researchers can develop AI systems that not only exhibit advanced problem-solving skills but also demonstrate a deeper understanding of the context, nuances, and implications of the information they process.

This chapter presents the Document UnderstanDing of Everything (DUDE) dataset, benchmark, and competition. It is presented in a similar form as the ICCV publication, extended with the results of the ICDAR competition. In line with the standpoint of the previous chapter, we call on the Document AI (DocAI) community to re-evaluate current methodologies and embrace the challenge of creating more practically-oriented benchmarks. This project aims to remediate the halted research progress in understanding visually-rich documents (VRDs). We present a new dataset with novelties related to the types of questions, answers, and document layouts, based on multi-industry, multi-domain, and multipage VRDs of various origins and dates.

Moreover, we push the boundaries of current methods by creating multi-task and multi-domain evaluation setups that more accurately simulate real-world situations where powerful generalization and adaptation under low-resource settings are desired. DUDE aims to set a new standard as a more practical, long-standing benchmark for the community, and we hope that it will lead to future extensions and contributions that address real-world challenges.

Additionally, we present the results of the DUDE competition and discuss the innovations demonstrated by participants. The competition was structured as a single task with a multi-phased evaluation protocol that assesses the few-shot capabilities of models by testing generalization to previously unseen questions and domains, a condition essential to the business use cases prevailing in the field. Under the newly studied settings, current SOTA models show a significant performance gap, even when improving visual evidence and handling multipage documents. We conclude that the DUDE dataset proposed in this competition will be an essential, long-standing benchmark to further explore for achieving improved generalization and adaptation under low-resource fine-tuning, as desired in the real world. To sum up, our work illustrates the importance of finding more efficient ways to model language, images, and layout in DocAI.
Figure 5.1. QA as a natural language interface to multipage VRDs. Example question-answer pairs with their type tags:
[non-answerable] Q: In which year does the Net Requirement exceed 25,000? A: None
[abstractive, counting] Q: How many attorneys are listed for the plaintiffs? A: Two
[layout-navigating, graphic-intensive] Q: Are the margins of the page uniform on all pages? A: Yes
[multi-hop, layout-navigating] Q: From the list of Top 10 Key Recovery Components, which is the last component listed on the second page? A: Hope
[extractive, list] Q: What are the Years mentioned in Chart 1? A: [2020, 2021, 2022]
[abstractive, graphic-intensive] Q: Does this document contain any checkboxes? A: No

5.1 Introduction

Early stages of research and growth in any field are characterized by enacting proofs-of-concept and demonstrating the feasibility of the proposed solution. In the Deep Learning era, this is often echoed by building narrow and simplified datasets that do not reflect real-world complexity, leading to models that may not be suitable for practical use.

The field of Document Understanding (DU) is no exception to the recent proliferation of deep architectures, which in this case are predominantly used for classification and information extraction from documents. However, the wide and complex nature of documents presents many challenges that remain unsolved or not yet addressed. One such challenge is domain generalization: a model trained on medical documents may not be directly applicable to financial or tabular content. Another challenge concerns task-agnostic architectures, where a model must be able to adapt to various DU subtasks such as document classification, key information extraction (KIE), and question answering (QA). Lastly, the high variability of document contents and layouts often leads to highly imbalanced samples within document types, resulting in a long-tailed distribution with few or almost no samples to train a model on.

Despite the importance of these challenges, there is currently no DU benchmark dataset that simultaneously addresses all of these issues. This paper proposes a novel dataset, formulated as an instance of Document Visual Question Answering (DocVQA), to evaluate how well current DU solutions deal with multipage documents, whether they can navigate and reason over visual layouts, and whether they can generalize their skills to different document types and domains.

The data collection and evaluation design of DUDE naturally motivates targeting models that can answer natural yet highly diverse questions (e.g., regarding document elements, their properties, and compositions) for any VRD (e.g., drawn from potentially unseen distributions of layouts, domains, and types). The presented problem setting relates to Multi-Domain Long-Tailed Recognition (MDLT) [507], which concerns learning from multi-domain imbalanced data whilst addressing label imbalance, divergent label distributions across domains, and possible train-test domain shift. Put plainly, since we cannot provide ground-truth QA pairs for, e.g., stamps, on every document type (domain), we expect a solution to transfer the subtask 'stamp detection', learned on document types where stamps naturally occur (and thus where training QA pairs were created organically), to other domains.
The DocVQA and MDLT formulations of DUDE allow us to create a long-standing, challenging benchmark that can in the future be easily extended with more subtasks formulated as QA pairs and with domains relating to document types (see Limitations).

The contribution of this work is twofold. First, we have created DUDE, a novel large-scale, multipage, multi-domain, multi-industry DocVQA benchmark for evaluating DU progress. Second, we show that the zero-shot and fine-tuned performance of current SOTA models applied to DU lags far behind human baselines, explained in part by the need for more holistic and efficient modeling of language, vision, and richly structured layouts.

5.2 Related Work

Document Understanding encompasses datasets related to various subtasks like document layout analysis [261, 544], classification [165], key information extraction [197, 432], table extraction [427, 543, 545], and visual question answering [308, 315, 450]. These benchmarks lead to end-to-end DU architectures that have transformed common DocAI practices [15, 134, 153, 187, 263, 365, 371]. These task-specific benchmarks, however, are often tailored to a single domain, limiting the ability to create and assess how well DU models generalize to other document types and domains. To fill this gap, we adopt a visual question answering (VQA) approach, which has been crucial in the growth of the DU field.

The VQA paradigm provides a natural language interface for various tasks from both computer vision and natural language processing. In the latter, the question-answering approach has been successfully used in several domains, including medicine [202, 209, 257, 318, 338, 352, 384], open-domain knowledge [281, 291, 313, 506], emotions [41, 155], code [7, 278], logical reasoning [282, 504, 516, 534], claim verification [185, 446, 523], and math [10, 65, 182, 316, 529]. As a result of its ability to function as a natural language interface for various forms of data, this paradigm has been applied to other domains as well. For example, the question-answering approach has been combined with modalities such as images [13, 38, 39, 161, 353, 513], speech [237, 514], knowledge graphs [106, 206, 408, 429, 457], videos [58, 59, 74, 158, 249], and maps [60, 359].

Overall, the convergence of computer vision and NLP through the emergence of VQA tasks has also opened up new avenues for research in the DU field, with many DU datasets now including rich visual content alongside questions. Yet, prior work on document VQA has mainly focused on single-page documents [308, 310, 449], with rare exceptions such as MP-DocVQA [451]. Moreover, [308, 449] pose only extractive questions, where the answer appears verbatim in the context on which the question is defined, as in other question answering benchmarks [235, 386, 456]. These datasets also do not contain non-answerable questions, unlike established (natural language) QA datasets such as [235, 387]. To the best of our knowledge, there are no VQA datasets containing questions that require lists as answers; only a few text-only QA datasets contain such answer types [83, 256, 357].
Other datasets related to ours are rather domain-specific [310, 375, 440, 441, 551]. We give a detailed comparison of the most related document VQA datasets in Table 5.1, highlighting our major contributions.

5.3 DUDE Dataset

While DUDE shares some similarities with existing VQA datasets, a closer comparison (see Table 5.1) highlights its unique features. We are confident that proficiency in the areas introduced in this work will showcase a model's capability to handle the intricacy and diversity of document understanding tasks in real-world scenarios.

Documents. The dataset covers a wide range of document types, sources, and dates, as shown in Table 5.1 and Figure 5.2, where its diverse nature is confirmed by the spread of document content representations. Moreover, it covers a broad range of domains, including medical, legal, technical, and financial, among others, to evaluate models' ability to handle diverse topics and the specific knowledge each requires. Furthermore, the dataset contains documents with varying layouts: diverse text arrangements, font sizes, and styles, to ensure that models can handle visually diverse documents.

[Figure 5.2 is a t-SNE projection of 5,641 documents, with one marker type per dataset: DocVQA, InfographicsVQA, Ours, TAT-DQA, VQA-CD, VisualMRC.]

Figure 5.2. Visualization of inter-document similarities between samples from different datasets (t-SNE over TF-IDF representations of 1k passages from each source).

In contrast to our proposal, current VQA datasets often focus on homogeneous documents, such as invoices in VQA-CD [302] or financial reports in TAT-DQA [551]. Even when not restricted to a single domain or layout, these datasets share essential characteristics. For example, InfographicsVQA [310] demonstrates significant diversity in topics and designs, but still embodies a preference for visual aids over complex tables or long text passages. Moreover, VQA datasets are commonly restricted to either born-digital or scanned documents, which limits their ability to measure robustness to the mixed-origin files that one usually finds in real-world applications. In particular, this restriction makes it uncertain whether state-of-the-art performers on website fragments from VisualMRC [440] can be efficient on multi-column layouts and on documents with OCR errors or incorrectly detected reading orders. Finally, a typical dataset for document VQA contains documents from a limited period, i.e., a few years (Table 5.1).

Considering the properties mentioned above, the most diverse dataset to date is Single Page DocVQA (SP-DocVQA) [308], which contains mixed-origin documents of different types created over several decades. However, it is built exclusively on single-page document excerpts and is limited to the several domains represented in the Industry Documents Library. As a result, it complements rather than serves as a touchstone for general-purpose DU systems. MP-DocVQA [451] extends it by including the previous and posterior pages of the documents.
However, the questions are kept the same, which makes the extra pages mere distractors.

Questions. We use VQA as a natural language interface to VRDs, challenging the DU model with diverse questions, advanced operations, and multi-step reasoning to achieve real-world success.

Firstly, we assert that various layouts and visual elements must be comprehended semantically. As such, we introduce complex questions targeting these document elements, requiring comprehension beyond the document content, such as 'how many text columns are there?', 'does the document contain words with diacritics?' or 'which page contains the largest table in the document?'. These layout-navigating questions bridge the gap between the Document Layout Analysis and Question Answering paradigms.

Our unique and detailed compositional questions demand a model that comprehends semantics and generalizes to new questions in a zero-shot setting. For example, >90% of our questions are unique, while we target questions whose answer scope is much more diverse than in previous works.(1) Since neural networks are known to perform poorly at mathematical reasoning and symbolic processing, we provide training and evaluation questions demanding arithmetic and comparison operations on numbers and dates.

(1) An answer type comparison is included in the supplementary materials.

Moreover, we feature multi-hop questions that probe a model's robustness to sequential reasoning and mimic how humans ask questions. They may be useful in real-world tasks, e.g., 'If the checkbox on page 1 section 3a indicates that the company is incorporated, how much yearly revenue did it generate in 2022 (given the table on page 5)?'

Answers. Even though some VQA datasets are deliberately limited to questions of exclusively extractive (SP-DocVQA) or abstractive (VisualMRC) nature, others do not obey such restrictions and include both question types (see Table 5.1). The dataset we provide includes both abstractive and extractive answers, covering various types such as textual, numerical, dates, yes/no, lists, or no answer.

This allows us to cover all possible business use cases and reveal major deficiencies of existing DU systems beyond typical textual answers. For instance, no existing VQA dataset includes both non-answerable questions and questions answered with a list. Consequently, the models considered to date tend to make unreliable guesses on questions with an answer not entailed by the content [387]. Our dataset is designed to cover answers beyond plain extractive text, such as a list of items or even 'None'.

The 'None' answer type demands that the model correctly identify that an answer cannot be provided because the question is ill-posed, e.g., it asks about the value of an empty cell in a table.
In addition, list-generation problems pose challenges to the model, as (1) more tokens need to be generated, (2) they may be sourced from different places in the document, and (3) the OCR reading order may influence the element ordering.

5.3.1 Gathering Documents

A fundamental difficulty in gathering raw source files was ensuring dataset diversity while fulfilling strict licensing requirements. Therefore, rather than depending on the initial sources of files, e.g., the libraries that originally published digitized materials, we resorted to aggregate websites.

The document collection process was manual and consisted of formulating queries to archive.org (containing 36M books and texts), commons.wikimedia.org (with 86M media files of various types), and documentcloud.org (with around 5M public documents). The queries consisted of keywords relevant to some category of interest; e.g., the resume category of our proposal consists of the keywords 'resume', 'cv', 'curriculum', and 'biography'. Where necessary, a separate query parameter ensured that the resulting files belonged to the public domain or were released under a permissive license. Information on keywords and the search procedure is distributed as part of the DUDE dataset.

From the resulting documents, we selected those representing the requested category and visually distinctive from the ones already gathered. Special care was put into removing examples that visibly expose controversial content or may be subject to privacy or legal concerns, despite the declared license. We collected five thousand, typically multipage, English documents using this methodology.

5.3.2 Annotation Process

The annotation process involved in-house annotators and Amazon Mechanical Turk freelancers. For the latter, there is limited control over expertise, and where justified, we resorted to limiting task availability depending on the number of completed tasks and the historical acceptance rate.(2) The former are five highly qualified people with a Ph.D. in Linguistics. These three annotation scenarios will be referred to as All MTurkers, Best MTurkers, and Qualified Linguists. We estimate the total cost of annotation involving both Linguists and MTurkers at $20,000.

(2) Approval above 97% over at least 5k HITs.

Phase 1. We started by providing All MTurkers with the documents described in Section 5.3.1 in separate batches aimed at collecting abstractive, extractive, and list QA pairs. Each freelancer was asked to propose up to five questions of a particular type and, in the case of extractive ones, to provide an evidence bounding box. The exception to this process is the annotation of non-answerable questions, previously shown to be particularly challenging [387]. These are predominantly annotated by Qualified Linguists and, because of their quality, promoted without passing through Phases 2-3.

Candidate QA pairs are semi-automatically filtered to exclude annotations that cannot be valid due to their length, the use of non-typical character combinations, or type-specific criteria, such as non-list answers in list batches.
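For illustration, a minimal sketch of such semi-automatic filters follows; the concrete rules, thresholds, and the list-answer delimiter below are hypothetical, not the ones actually used for DUDE:

    import re

    def plausible_qa(question: str, answer: str, batch_type: str) -> bool:
        """Hypothetical Phase-1 validity filter: length limits, non-typical
        character combinations, and type-specific criteria."""
        if not (3 <= len(question.split()) <= 60):       # length criterion (illustrative)
            return False
        if re.search(r"(.)\1{4,}", answer):              # non-typical runs, e.g. '!!!!!'
            return False
        if batch_type == "list" and "|" not in answer:   # assume '|' delimits list items
            return False
        if batch_type == "extractive" and not answer.strip():
            return False
        return True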
Additionally, we cluster duplicate and near-duplicate question-answer pairs to ensure dataset diversity and promote them directly to Phase 3 after a manual review (the same QA pair provided independently by several annotators indicates its validity).

Phase 2. The rest of the annotations promoted from Phase 1 were directed to All MTurkers, but this time, instead of providing complete QA pairs, they were asked to answer the question from the previous round. The obtained triples of a question and two answer variants (one from each phase) were evaluated using inter-answer ANLS (defined in Section 5.3.5) and promoted to the final dataset if the agreement was >0.8. Otherwise, QA triples were directed to Phase 3.

Phase 3. Best MTurkers were provided with the document, question, and answer variants to decide the correctness of each answer and optionally overrule both variants if neither is correct. Outliers from decisions in this phase, such as overruling without a judgment on the previous answers, were reviewed by Qualified Linguists and corrected if needed.

Optional Phase 4. Annotations of the test set were reviewed by Qualified Linguists. Given the data from Phase 3, they corrected questions and answers and created metadata related to the diagnostic categories described in Section 5.3.4.

5.3.3 Dataset Statistics

Property                  | Ours               | SP-DocVQA     | VisualMRC    | InfographicsVQA | TAT-DQA
Dataset-level properties
Sources                   | Multi              | Industry docs | Web pages    | Infographics    | Finance reports
Origin                    | BD, Scan           | Mostly scans  | BD           | BD              | BD
Period                    | 1860-2022          | 1960-2000     | Jan-Mar 2020 | not specified   | 2018-2020
Documents                 | 5,019              | 12,767        | 10,234       | 5,485           | 2,758
Pages (avg±std)           | 5.72±6.4           | 1.0±0.0       | 1.0±0.0      | 1.0±0.0         | 1.11±0.32
Tokens (avg±std)          | 1,831.53±2,545.06  | 183±149.96    | 154.19±79.34 | 287.98±214.57   | 576.99±290.12
Simpson coeff. (ResNet)   | 0.82               | 0.76          | 0.83         | 0.86            | 0.73
Simpson coeff. (Tf-Idf)   | 0.95               | 0.93          | 0.99         | 0.94            | 0.15
Question-level properties
Questions                 | 41,541             | 50,000        | 30,562       | 30,035          | 16,558
Unique (%)                | 90.9               | 72.34         | 96.26        | 99.11           | 95.65
Length (avg±std)          | 8.65±3.35          | 8.34±3.04     | 9.38±4.01    | 11.57±3.71      | 12.51±4.18
Semantics                 | All                | T, L, F, Ch   | T, L, F, Ch  | T, L, F, Ch, M  | T, L
Answer-level properties
Unique (%)                | 70.7               | 64.29         | 91.82        | 48.84           | 77.54
Length (avg±std)          | 3.35±6.1           | 2.11±1.67     | 8.38±6.36    | 1.66±1.43       | 3.44±7.20
Extractive (%)            | 42.39              | 100.0         | 0.0          | 71.96           | 55.72
Abstractive (%)           | 38.25              | 0.0           | 100.0        | 24.91           | 44.28
List (%)                  | 6.62               | 0.0           | 0.0          | 5.69            | 0.0
None (%)                  | 12.74              | 0.0           | 0.0          | 0.0             | 0.0

Table 5.1. Summary of the existing English document datasets and our challenge. BD stands for born-digital. Layout semantics are abbreviated as (T)able, (L)ist, (F)igure, (Ch)art, and (M)ap. Comparison based on Azure Cognitive Services (3.2) OCR.

We conducted a statistical analysis of our dataset and found that the distributions of document length, question length, and answer type are much more diverse than in other datasets in the same domain. We also used the Simpson diversity coefficient [421] for the analysis and summarize the results in Table 5.1.
The following are the counts for the data split:

            | train  | val   | test (diagnostic)
documents   | 3,010  | 749   | 1,215 (530)
questions   | 23,728 | 6,315 | 11,448 (2,462)

Table 5.2. Data split counts.

The distribution of the number of tokens per document is much more diverse than in other datasets, a consequence of the more diverse distribution of pages (see Figure 5.4). Note that some of the documents are more visual than textual (or even visual-only), making the left whisker essentially reach 0 (log2 scaling of the x-axis).

Figure 5.3. Distribution of the number of tokens in documents, answers, and questions.

Figure 5.4. While other datasets are predominantly single-page only, the number of pages featured in DUDE is more diverse, yet still biased towards shorter documents.

The distribution of the number of tokens in answers is heavy-tailed; to some extent, this is also a property of the distribution of the number of tokens in questions. Furthermore, 90.9% of questions are unique, and so are 70.7% of answers (taking answer variants into account).

We scrutinized the answer types by aggregating possible answers into classes representing the information they convey. The study used heuristics to determine whether the answers fit into a NER labeling scheme [20], into categories we anticipated, such as yes/no and none, or into ones we did not anticipate, such as color. This resulted in 25 different groups of answers, with the 'other' answer type being the fourth-largest group. Cramer's V coefficient was used to check for correlations between question types and answer types, and the results indicated that there were few correlations. The expected correlations, such as 'none' answers with non-answerable questions or yes/no answers with abstractive questions, were present, but barely any correlation was significant. This suggests it is hard to guess the answer based on the question alone.
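For reference, Cramer's V can be computed from the question-type by answer-type contingency table; a minimal sketch using SciPy's chi-squared test (the toy table below is for illustration only, not DUDE data):

    import numpy as np
    from scipy.stats import chi2_contingency

    def cramers_v(contingency: np.ndarray) -> float:
        """Cramer's V for an r x c contingency table."""
        chi2, _, _, _ = chi2_contingency(contingency)
        n = contingency.sum()
        r, c = contingency.shape
        return float(np.sqrt(chi2 / (n * (min(r, c) - 1))))

    # Toy example: rows = question types, columns = answer types.
    table = np.array([[120,  5,  3],
                      [ 10, 80,  6],
                      [  8,  7, 90]])
    print(cramers_v(table))  # a value near 1 would mean answers are guessable from questions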
We also study a relative diversity measure called the Simpson coefficient [421, 546]. To define it, consider a fixed distance function d(a_1, a_2) defined for pairs of documents a_1, a_2 ∈ A (the dataset); in our applications, it is the cosine distance between document embeddings. Further, for an arbitrary number of datasets A_1, ..., A_N, the diversity of A_1 with respect to A_2, ..., A_N is defined as

    Div_{A_2,...,A_N}(A_1) = 1 − p( d(a_1^1, a_2^1) < min_{i=2:N} d(a_1^i, a_2^i) ),

where a_1^i, a_2^i ∈ A_i are randomly selected pairs (i = 1..N). We report the relative diversities of each of the datasets, relative to the other datasets in the study, based on two embeddings: visual (ResNet-101 embeddings) and semantic (Tf-Idf embeddings), in Table 5.1. The results show that the probability that two random documents from DUDE are more similar than a random pair of documents from any other dataset is small, meaning that documents in our dataset are well-distributed and diverse.
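A Monte Carlo estimator of this measure is straightforward; the sketch below (our own, assuming each dataset is given as a matrix of document embeddings, one row per document) samples random pairs and counts how often the within-A_1 pair is the closest:

    import numpy as np

    def cosine_distance(u, v):
        return 1.0 - (u @ v) / (np.linalg.norm(u) * np.linalg.norm(v))

    def simpson_diversity(A1, others, n_trials=10_000, seed=0):
        """Estimate Div_{A2..AN}(A1): 1 minus the probability that a random
        within-A1 pair is closer than every random within-Ai pair (i >= 2)."""
        rng = np.random.default_rng(seed)
        hits = 0
        for _ in range(n_trials):
            a, b = A1[rng.choice(len(A1), size=2, replace=False)]
            d_self = cosine_distance(a, b)
            d_other = min(
                cosine_distance(*Ai[rng.choice(len(Ai), size=2, replace=False)])
                for Ai in others
            )
            hits += d_self < d_other
        return 1.0 - hits / n_trials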
5.3.4 Diagnostic Subsets

Following previous DU datasets, we gather diagnostic metadata for close to half of the documents and QA pairs in the test set (see Figure 5.5). These are intended to enable a fine-grained analysis of the models' performance. The taxonomy used is an extension of the one from earlier works [47, 308, 310]; it covers DUDE-specific questions and enables a more detailed examination of the visual artifacts under consideration.

[Figure 5.5 is a bar chart of category counts, grouped into Complexity (simple; complex: layout, multi-hop, other), Evidence (handwriting, layout, plain, table or list; visual: chart, checkbox, color, image, logo, map, stamp, other), Form (date, numeric, proper name, other), Operation (arithmetic, comparison, counting, normalization), and Type (abstractive, extractive).]

Figure 5.5. Count of particular diagnostic categories in a subset of 2.5k test set QA pairs annotated in detail to help analyze models' performance.

Question type and perceived complexity. We distinguish questions perceived as simple, i.e., those based on spotting a value near a phrase mentioned explicitly as part of the question. For example, 'Who is the Secretary of the U.S. Department of Commerce?' when the document contains 'Penny Pritzker, Secretary, U.S. Department of Commerce.' Such a question could be guessed given an approximate string matching algorithm and does not require much comprehension beyond that. The remaining questions are marked as hard, with distinguished categories of hard multi-hop questions and hard meta/layout-navigating questions.

Answer evidence. We provide information on what types of elements have to be comprehended to provide an answer, including free text, handwriting, table or list, and layout, i.e., non-tabular spatial understanding of text placement. These follow the ontology established by previous works [47, 308, 310]. In addition, we supply hints on the graphical artifacts one needs to consider for particular questions, such as image/photo, plot/chart, checkbox, and annotation.

Required operation. We distinguish arithmetic, comparison, counting, and normalization operations to indicate the need to perform, respectively, arithmetic operations on extractable data, comparison of numerical values or sizes, counting of elements, or conversion of data present in the document to another format (e.g., rounding or date format conversion).

Answer form/shape. Finally, we provide information on the shallow form of the returned answer, including date, numeric, and proper name.

5.3.5 Evaluation

The evaluation process follows the typical paradigm of separate training, validation, and test splits. We provide both a standalone evaluator and a website(3) [467] to submit test set predictions.

(3) rrc.cvc.uab.es/?ch=23

To assess models' performance, we rely on the ANLS metric introduced by the authors of the ST-VQA dataset [39]. Roughly speaking, it is a generalization of accuracy that does not penalize the system for an answer whose similarity to the gold standard, measured with normalized Levenshtein similarity, is above a specified threshold. Moreover, the metric assumes the presence of multiple, equally valid reference answers. These properties account for possible OCR errors and different phrasings, such as the same numerical answer represented as 'two' and '2' by different annotators.

In practice, production DU systems provide an estimate of confidence in order to triage documents that do not need to be manually reviewed by a human. While the reliability of the automation ability of a DU solution is deemed quintessential for generating business value in practice [48], DU research rarely reports any confidence evaluation. Some exceptions exist in closely related task domains like scene text recognition [425] and QA [208, 531]. With DUDE, we want to establish calibration evaluation and confidence ranking as a default evaluation methodology in DU, especially since the field is so close to applications. To this end, we report (next to ANLS) two additional metrics: the Expected Calibration Error (ECE) [156, 332, 340] and the Area Under the Risk-Coverage Curve (AURC) [138, 193].

Calibration requires that the probability a model assigns to its predictions equals their true likelihood of being correct [86, 88, 520]. ECE approximates the top-1 calibration error by a weighted average over the accuracy/confidence difference of histogram bins. In our evaluation setting, we consider a predicted answer correct if its ANLS to the ground truth answer is above a pre-defined threshold (τ=0.5). For consistency, non-answerable and list answers both have confidence estimated for the answer as a whole (regardless of the number of answers). Following [342], we apply equal-size binning (with 100 bins, Lp = 1), avoiding some pathologies of equal-range binning [231, 463].

AURC is a selective classification metric that evaluates how well an estimator prevents silent failures on an i.i.d. test set. As an aggregate measure of estimator performance (ANLS) and confidence ranking, it provides a more practically useful estimate of overall performance when the estimator can abstain from (low-confidence) decisions and defer to a human for feedback.

By reporting the above metrics, we hope that future work will contribute methods (e.g., calibration methods for improved forecasting or metrics for better predictive uncertainty evaluation) that concretely target the empirical observations of overconfidence/miscalibration in DU models.
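To make these metrics concrete, here is a minimal sketch (our own illustration; the official scorer is the standalone evaluator mentioned above): ANLS with τ=0.5 against multiple references, equal-mass-binned ECE, and AURC from per-question confidences and risks (e.g., risk = 1 when an answer's ANLS falls below τ, else 0):

    import numpy as np

    def levenshtein(a: str, b: str) -> int:
        prev = list(range(len(b) + 1))  # classic dynamic-programming edit distance
        for i, ca in enumerate(a, 1):
            curr = [i]
            for j, cb in enumerate(b, 1):
                curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
            prev = curr
        return prev[-1]

    def anls(prediction: str, references: list[str], tau: float = 0.5) -> float:
        """1 - normalized Levenshtein distance to the closest reference;
        similarities below the threshold are zeroed out."""
        best = 0.0
        for ref in references:
            p, r = prediction.strip().lower(), ref.strip().lower()
            nl = levenshtein(p, r) / max(len(p), len(r), 1)
            best = max(best, 1.0 - nl if nl < tau else 0.0)
        return best

    # Reproduces the human-baseline examples from Section 5.5:
    # anls("Eagle", ["an eagle"]) = 0.625, anls("62%", ["62"]) ~= 0.67,
    # anls("1958-04-29", ["4-29-58"]) = 0.

    def ece(confidences, correct, n_bins: int = 100) -> float:
        """Expected Calibration Error with equal-mass bins (L1)."""
        conf = np.asarray(confidences, float)
        corr = np.asarray(correct, float)
        order = np.argsort(conf)
        total = 0.0
        for idx in np.array_split(order, n_bins):
            if idx.size:
                total += idx.size / conf.size * abs(corr[idx].mean() - conf[idx].mean())
        return total

    def aurc(confidences, risks) -> float:
        """Area under the risk-coverage curve: mean cumulative risk when
        predictions are admitted in order of decreasing confidence."""
        order = np.argsort(-np.asarray(confidences, float))
        r = np.asarray(risks, float)[order]
        return float((np.cumsum(r) / np.arange(1, r.size + 1)).mean())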
5.4 DUDE Competition

Over the past few years, the field of Document Analysis and Recognition (DAR) has embraced multimodality, with contributions from both NLP and CV. This has given rise to DU as the all-encompassing solution [15, 187, 371] for handling VRDs, where layout and visual information are decisive in understanding a document. This umbrella term subsumes multiple subtasks, ranging from KIE [197, 432] and DLA [544] to VQA [310, 450] and table recognition [201, 376]. For each of these subtasks, influential challenges have been proposed, e.g., the ICDAR 2019 Scene Text VQA [38, 39] and ICDAR 2021 Document VQA (DocVQA) [308, 450] challenges, which in turn have generated novel ideas that have impacted the new wave of architectures currently transforming the DAR field. Nevertheless, we argue that the DAR community must embrace the future challenges (multi-domain, multi-task, multipage, low-resource settings) that naturally juxtapose the previous competitions with the pragmatic feedback attained via business-driven applications.

5.4.1 Challenge Objectives

We aim to support the emergence of models with strong multi-domain layout reasoning abilities by adopting a diversified setting where multiple document types with different properties are present. Moreover, a low-resource setting (in the number of samples) is assumed for every domain provided, which, formulated as a DocVQA competition, allows us to measure progress with regard to the desired generalization (Section 5.4.3.1). Additionally, we strive for the development of confidence estimation methods that can not only improve predictive performance but also adjust the calibration of model outputs, leading to more practical and reliable DU solutions. We believe that DUDE's emphasis on task adaptation and on the capability of handling a wide range of document types, layouts, and complexities will encourage researchers to push the boundaries of current DU techniques, fostering innovation in areas such as multimodal learning, transfer learning, and zero-shot generalization.

5.4.2 Challenge Contributions

DUDE answers the call for measuring improvements closer to the real-world applicability of DU models. By design of the dataset and competition, participants were forced to make novel contributions in order to make a significant impact on the DU task. Competitors showcased intriguing model extensions, such as combining models that learn strong document representations with the strengths of recent large language or vision-language models (ChatGPT [52] and BLIP2 [258, 260]) to better understand questions and extract information from a document context more effectively. HiVT5+modules extended Hi-VT5 [451] with token/object embeddings for various DU subtasks, while MMT5 employed a two-stage pretraining process and multiple objectives to enhance performance. These innovative extensions highlight the ingenuity in addressing the complex challenges of document understanding.
5.4.3 Motivation and Scope

We posit that progress in DU is determined not only by improvements in each of its predecessor fields (CV, NLP) but even more by factors connected to document intelligence as it is explicitly understood in business settings. To improve the real-world applicability of DU models, one must consider (i) the availability and variety of document types in a dataset, as well as (ii) the problem-framing methods.

Currently, publicly available datasets avoid multipage documents, are not concerned with multi-task settings, and do not provide multi-domain documents of sufficiently different types. These limitations hinder real-world DU systems, given the ever-increasing number of document types occurring in various business scenarios. This problem is often bypassed by building systems on private datasets, which leads to a situation where datasets cannot be shared, documents of interest are not covered in benchmarks, and published methods cannot be compared objectively. DUDE counters these limitations by explicitly incorporating a large variety of multipage documents and document types.

Furthermore, the adaptability of DU to the real world is slowed down by the low-resource setting: only a limited number of training examples can be provided, involving unpleasant manual labor and, subsequently, costly model development. Any time a new dataset is produced in a scientific or commercial context, a new model must be specifically designed and trained on it to achieve satisfactory performance. At the same time, transfer learning is the most promising route to rapid model improvements, while zero- and few-shot performance still needs to be addressed in evaluation benchmarks.

Bearing in mind the characteristics outlined above, we formulated the DUDE dataset as an instance of DocVQA to evaluate how well current solutions can simultaneously handle the complexity and variety of real-world documents and all the subtasks that can be expected. Optimally, a DU model should understand layout in a way that allows for zero-shot performance through attaining 'desired generalization', i.e., generalization to any documents (e.g., drawn from previously unseen distributions of layouts, domains, and types) and any questions (e.g., regarding document elements, their properties, and compositions). Therefore, we incorporated these criteria while designing our dataset, which may stand as a common starting point and a cooperative path toward progress in this emerging area.

5.4.3.1 Desired Generalization

The challenge presented by DUDE is an instance of a Multi-Domain Long-Tailed Recognition (MDLT) problem [507].

Definition 14 (Multi-Domain Long-Tailed Recognition). MDLT focuses on learning from multi-domain imbalanced data whilst addressing label imbalance, divergent label distributions across domains, and potential train-test domain shift. This framework naturally motivates targeting estimators that generalize to all domain-label pairs.

A domain D = {(x_i, y_i)}_{i=1}^{N} is composed of data sampled from a distribution P_XY, where X denotes an input space (documents) and Y the output space (QA pairs).
Each x ∈ X represents a document, forming a tuple (v, l, t) that expresses a complex composition of visual, layout, and textual elements. For simplicity, consider that each 'label' y ∈ Y represents a question-answer pair, relating to an implicit task to be completed (such as date KIE in 'What is the document date?'). Due to the potentially compositional nature of QA, the label distribution is evidently long-tailed. During training, we are given M domains (document types) on which we expect a solution to generalize (Figure 5.6), both within domains (different numbers of samples for each unique task) and across domains (even without examples of a task in a given domain).

What sets domains apart is any difference in their joint distributions, P_XY^j ≠ P_XY^k. For example, an invoice is less similar (in terms of language use, visual appearance, and layout) to a contract than to a receipt or credit note. Yet, a credit note naturally contains a stamp stating information such as 'invoice paid', whereas receipts rarely contain stamps. This might require a system to transfer 'stamp detection' learned within another domain, say on notary deeds.

Figure 5.6. Illustration of MDLT as applicable to the DUDE problem setting. The y-axis aggregates skills related to specific KIE or reasoning tasks over document elements (checkbox, signature, logo, footnote, ...). The x-axis denotes the obtained samples (QA pairs) per task. Each domain has a different label distribution P(Y), typically relating to within-domain document properties P(X). This training data exhibits label distribution shifts across domains, often requiring zero-shot generalization (marked red).

Notably, it will be 'organic' to obtain more examples of certain questions (tasks) in a given domain. This should also encourage models to learn a certain skill in the domains where they have more training examples. Put plainly, it is better to learn checkbox detection on contracts than on invoices, which rarely contain any. This MDLT framework allows us to create a lasting, challenging benchmark that can be easily extended in the future with more tasks (formulated as QA pairs) and domains (relating to document types). In the first iteration of the DUDE competition, we have targeted specific skills by guiding annotators with focused instructions, which we share for future extensions.

5.4.4 DUDE Competition Protocol

The ICDAR 2023 competition on Document UnderstanDing of Everything took place from February to May of 2023. A training-validation set with 30k QA annotations on 3.7k documents was given to participants at the beginning of February. The 11.4k test set questions on 1.2k documents (see Table 5.2) were only made accessible during a window between March and May.
Participants were asked to submit results obtained on the public, blind test set documents rather than deliver model executables, although they were encouraged to open-source their implementations. We relied on the scientific integrity of the participants to adhere to the competition's guidelines specified on the Robust Reading Competition (RRC) portal.(4)

(4) https://rrc.cvc.uab.es/?ch=23

5.4.4.1 Task Formulation

Given an input consisting of a PDF with multiple pages and a natural language question, the objective is to provide a natural language answer together with an assessment of the answer confidence (a float value scaled between 0 and 1). Each unique document is annotated with multiple questions of different types, including extractive, abstractive, list, and non-answerable. Annotated QA pairs are not restricted to the answer being explicitly present in the document. Instead, any question on the aspect, form, or visual/layout appearance of the document under review is allowed.

Additionally, competitors were allowed to submit results for only a specific answer type (provided in the annotations) so that, for example, encoder-only architectures could compete in DUDE on extractive questions. Another important subtask is to obtain a calibrated and selective DocVQA system, which lowers its answer confidence when unsure about its answers and does not hallucinate in the case of non-answerable questions. Regardless of the number of answers (zero in the case of non-answerable questions or multiple for list questions), we expect a single confidence estimate for the whole answer to guarantee consistency in the calibration evaluation. To promote fair competition, we provided three OCR versions of each document, obtained from one open-source engine (Tesseract) and two commercial engines (Azure, AWS).

5.4.4.2 Evaluation Protocol

The first evaluation phase assumes only independently and identically distributed (i.i.d.) data containing a similar mixture of document and question-answer types across the train-validation-test splits. The same evaluation metrics as for the benchmark apply to this phase.

The (implicit) second evaluation phase created a mixture of seen and unseen domain test data. This was launched jointly with the first evaluation phase, as otherwise one would be able to detect the novel unseen-domain test samples. To score how gracefully a system deals with unseen-domain data, the evaluation metric is AUROC [270], which roughly corresponds to the probability that a positive example (in-domain) is assigned a higher detection score than a negative example (out-of-domain). A system is expected to either lower its confidence or abstain from giving an answer.

There is a strict difference between a non-answerable question and an unseen-domain question. For the former, the document is from a domain that was included during training, yet the question cannot be solved with the document content, e.g., asking about who signed the document without any signatures present. For the latter, the question is apt for the document content, yet the document is from a domain that was not included during training and validation, which we would expect the system to pick up on.
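Under the assumption that the detection score is simply the model's answer confidence, this AUROC can be estimated directly as the rank statistic it corresponds to; a minimal sketch (our own):

    import numpy as np

    def auroc(scores_in, scores_out) -> float:
        """Probability that a random in-domain example receives a higher
        detection score than a random out-of-domain one (ties count half)."""
        s_in = np.asarray(scores_in, float)[:, None]
        s_out = np.asarray(scores_out, float)[None, :]
        return float((s_in > s_out).mean() + 0.5 * (s_in == s_out).mean())

    # e.g. auroc([0.9, 0.8, 0.7], [0.6, 0.75]) -> 5/6 ~= 0.83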
All metric implementations and evaluation scripts are made available as a standalone repository to allow participants to evaluate close to the official blind test evaluations.(5)

(5) https://github.com/Jordy-VL/DUDEeval

All submitted predictions are automatically evaluated, and the competition site provides ranking tables and visualization tools newly adapted to PDF inputs to examine the results. After the formal competition period, it will serve as an open archive of results. The main competition winner is decided based on the aggregate high scores for ANLS, AURC, and AUROC.

5.5 DUDE Benchmark

5.5.1 Baselines

Human performance. To establish the human baseline, we assigned test set questions to Qualified Linguists, ensuring none of them faced the same documents they reviewed in Phase 4. The procedure results in an estimate of 74.76 ANLS points (Table 5.3). At first glance, this result seems low. Still, when analyzing results case by case, it turns out that it is hard to score much better, since the answer format can strongly influence the overall results: 'Eagle' vs. 'an eagle' (0.625 ANLS), '62%' vs. '62' (0.67 ANLS), '1958-04-29' vs. '4-29-58' (0 ANLS), 'Clemson University, Clemson South Carolina' vs. 'Clemson University' (0 ANLS). Humans achieved the lowest performance (67.58) on the extractive question type, which confirms our hypothesis, since abstractive answers are shorter (mostly numbers, yes/no, or colors).

We analyzed the maximum score achieved by the best-performing model for each diagnostic test category and plotted it against human performance in Figure 5.7.

Figure 5.7. We report the average ANLS for the human expert vs. the best-performing model per diagnostic category as a ceiling analysis.

Reference models. We assessed a group of models to determine how their performance is influenced by different factors, such as (1) their ability to handle textual, layout, and visual elements, (2) whether they were fine-tuned for the task, (3) their size (in trainable parameters), and (4) the maximum input length they can handle.

To analyze factors (1) and (2), we conducted a zero-shot evaluation of several baseline text-only models. We used three encoder-based models (BERT [94], Longformer [28], and BigBird [521]) that cannot generate text and three that feature a decoder (T5 [383], GPT-3-Davinci [52], and ChatGPT) and thus have this capability. Next, we extended the T5 architecture with 2D layout embeddings [47, 371] and fine-tuned models with increasing maximum sequence lengths (512 → 8192) on DUDE. Finally, we evaluated our replication of the hierarchical Hi-VT5 model [451], as this model has the ability to decode text, understand multipage layouts, and comprehend visual page features using DiT [259].
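As an illustration of the 2D extension, here is a minimal sketch of the idea (our reading of the approach, not the exact implementation of [47, 371]; the class and parameter names are ours): learned embeddings of quantized token bounding boxes are added to the word embeddings before the T5 encoder.

    import torch
    import torch.nn as nn
    from transformers import T5ForConditionalGeneration

    class T52D(nn.Module):
        """T5 with additive 2D position (bounding-box) embeddings."""
        def __init__(self, name: str = "t5-base", n_bins: int = 1000):
            super().__init__()
            self.t5 = T5ForConditionalGeneration.from_pretrained(name)
            d_model = self.t5.config.d_model
            # One embedding table per box coordinate (x0, y0, x1, y1),
            # quantized into n_bins buckets.
            self.box_embed = nn.ModuleList(
                [nn.Embedding(n_bins + 1, d_model) for _ in range(4)]
            )

        def forward(self, input_ids, boxes, **kwargs):
            # boxes: (batch, seq_len, 4) integer coordinates in [0, n_bins]
            h = self.t5.shared(input_ids)              # word embeddings
            for k in range(4):
                h = h + self.box_embed[k](boxes[..., k])
            return self.t5(inputs_embeds=h, **kwargs)  # e.g. kwargs={"labels": ...}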
Regarding factors (3) and (4), we evaluated models of various sizes, ranging from 131M (BigBird) to 175B (GPT-3-Davinci) parameters, and varied the input context from 512 (BERT) to 20480 (Hi-VT5) tokens. Overall, we thoroughly evaluated multiple models in different testing setups to determine their performance under various conditions, as seen in Table 5.3.

Model          | Init.         | Params | Max Seq. | Test Setup      | ANLS_all↑ | ECE_all↓ | AURC_all↓ | ANLS_do | Abs   | Ex    | NA    | Li
text-only encoder-based models
Big Bird       | MPDocVQA      | 131M   | 4096     | Concat*         | 26.27     | 30.14    | 44.22     | 30.67   | 7.11  | 40.26 | 12.75 | 8.46
BERT-Large     | MPDocVQA      | 334M   | 512      | Max Conf.*      | 25.48     | 34.06    | 48.60     | 32.18   | 7.28  | 42.23 | 5.88  | 11.13
Longformer     | MPDocVQA      | 148M   | 4096     | Concat*         | 27.14     | 27.59    | 44.59     | 33.45   | 8.55  | 43.58 | 10.78 | 10.62
text-only encoder-decoder models
T5             | base          | 223M   | 512      | Concat-0*       | 19.65     | 19.14    | 48.83     | 25.62   | 5.24  | 33.91 | 0     | 7.31
T5             | MPDocVQA      | 223M   | 512      | Max Conf.*      | 29.48     | 27.18    | 43.06     | 37.56   | 21.19 | 44.22 | 0     | 10.56
T5             | base          | 223M   | 512      | Concat+FT       | 37.41     | 10.82    | 41.09     | 40.61   | 42.61 | 48.20 | 53.92 | 16.87
T5             | base          | 223M   | 8192     | Concat+FT       | 41.80     | 17.33    | 49.53     | 44.95   | 47.62 | 50.49 | 63.72 | 7.56
text-only large language models (LLM)
ChatGPT        | gpt-3.5-turbo | 20B    | 4096     | Concat-0        | -         | -        | -         | 35.07   | 16.73 | 42.52 | 70.59 | 15.97
ChatGPT        | gpt-3.5-turbo | 20B    | 4096     | Concat-4        | -         | -        | -         | 41.89   | 22.19 | 49.90 | 77.45 | 17.74
GPT3           | davinci3      | 175B   | 4000     | Concat-0        | -         | -        | -         | 43.95   | 18.16 | 54.44 | 73.53 | 36.32
GPT3           | davinci3      | 175B   | 4000     | Concat-4        | -         | -        | -         | 47.04   | 22.37 | 57.09 | 63.73 | 40.01
text+layout encoder-decoder models
T5-2D          | base          | 223M   | 512      | Concat+FT       | 37.10     | 10.85    | 41.46     | 40.50   | 42.48 | 48.62 | 52.94 | 3.49
T5-2D          | base          | 223M   | 8192     | Concat+FT       | 42.10     | 17.00    | 48.83     | 45.73   | 48.37 | 52.29 | 63.72 | 8.02
T5-2D          | large         | 770M   | 8192     | Concat+FT       | 46.06     | 14.40    | 35.70     | 48.14   | 50.81 | 55.65 | 68.62 | 5.43
text+layout+vision models
HiVT5          | -             | 316M   | 20480    | Hierarchical+FT | 23.06     | 11.91    | 54.35     | 22.33   | 33.94 | 17.60 | 61.76 | 6.83
LayoutLMv3     | MPDocVQA      | 125M   | 512      | Max Conf.*      | 20.31     | 34.97    | 47.51     | 25.27   | 8.10  | 32.60 | 8.82  | 7.82
Human baseline |               |        |          |                 | -         | -        | -         | 74.76   | 81.95 | 67.58 | 83.33 | 67.74

Table 5.3. Summary of baseline performance on the DUDE test set (all) and diagnostic subset (do). Test setups are defined as Max Conf.: predict one answer per page and return the answer with the highest probability over all pages; Concat: predict on tokens truncated to the maximum sequence length; FT stands for fine-tuning on DUDE training data; and -0 refers to zero-shot and -4 to few-shot inference. Average ANLS results per question type are abbreviated as (Abs)tractive, (Ex)tractive, (N)ot-(A)nswerable, (Li)st. (*) We report only results for the best-performing test setup (either Max Conf. or Concat). All scalars are scaled between 0 and 100 for readability.

5.5.2 Analysis & Discussion

To summarize, our study reveals that existing advanced language models such as BERT, Longformer, and BigBird struggle with comprehending visual elements and document layouts. To address this issue, we introduced the T5, T5-2D, and Hi-VT5 models that incorporate layout and visual information.
Still, their performance remains unsatisfactory, as evidenced by the comparison with the human baseline, similar to what has been reported for InfographicsVQA. This indicates that there is still room for enhancing the visual understanding of models on DUDE. Moreover, our findings indicate that a large LLM capable of processing long inputs is, by itself, insufficient for achieving strong performance on DUDE, especially for the extractive answer type. Finally, document length significantly affects the models' scores, as seen by the increase of 4.4-5.0 points when the T5 and T5-2D context length is extended from 512 to 8192. Similarly, model size correlates positively with the final score, but this holds only within a particular model type and is not the main factor influencing the results. State-of-the-art performance of 46.04 ANLS_all was achieved by T5-large with 2D layout understanding consuming 8192 tokens, confirming the observation above.

5.6 Detailed Results Analysis

5.6.1 Within Model Class Analysis

5.6.1.1 Encoder vs. Decoder

A key difference between encoder-only and (encoder-)decoder-based models is the ability to generate answers beyond the explicit textual content of the document. This is clearly reflected in the results for BigBird, Longformer, BERT, and LayoutLMv3, which score <10 ANLS% on abstractive questions, whereas they obtain merely average scores for extractive questions. On DUDE, we can claim that a generative model is necessary, given all the considered question types. Quite remarkably, while the human baseline demonstrates that humans find abstractive questions (ANLS ±82%) easier than extractive questions (ANLS ±68%), the reverse is true for all machine baselines. A potential confounder for these results could be the difference in output formatting for extractive vs. abstractive answers, which is hard to take into account with ANLS evaluation.

5.6.1.2 Incorporating Layout & Vision

When comparing T5 with and without 2D position embeddings on the diagnostic categories, we find the highest improvements on 'evidence table or list', 'complexity simple', and 'evidence plain'. Our study with the proposed baselines shows that questions requiring visual evidence are an important future challenge for the vision community. To get further insight into models' performance on these questions, we calculate a weighted average of ANLS over the visual categories. This reveals that GPT3 (4-shot) and T5-2D-large-8K obtain a tied score (ANLS=37%), even though they only have access to the text. Human performance, on the other hand, is close to double that (ANLS=72%), showing the need for better integration of the visual modality in DU models.

5.6.1.3 Toward Long Document Processing

DUDE clearly requires methods that can process long sequences, as evidenced by its average document length of 1832±2545 tokens. This is particularly evident when comparing standard NLP QA methods like BERT-concat, which underperforms Longformer [28] and BigBird [521], despite being the large version.
Experiments with T5 and T5-2D further support this claim, as extending the sequence length from 512 to 8192 leads to a ~5% ANLS improvement. The exception is Hi-VT5 [451], which performs worse than the rest of the methods. The authors of Hi-VT5 performed a text-denoising pretraining task that helped to better model the [PAGE] tokens, resulting in a better, compressed representation of the relevant information within a document conditioned on a question. Moreover, through extensive experimentation they found that 10 [PAGE] tokens per page were the best fit for the MP-DocVQA [451] dataset. We used similar hyperparameters, but DUDE might require better tuning of the [PAGE] tokens, since its images are more visually rich, with colored graphics and layouts. The hierarchical processing of documents with a meaningful visual component is a promising avenue for future research.

5.6.1.4 Diagnosis of LLM Results

The reasoning for including LLMs as baselines stems from the question: 'Does advanced text understanding suffice for solving DUDE?'. Our results on the diagnostic categories reveal some strengths and weaknesses of LLMs in the DocVQA task setting.

Strengths. GPT3 trumps all other tested models on list-type questions (ANLS=36-40%), which can be explained by the extractive nature of these questions. With 4-shot prompting, ChatGPT is better than all other tested baselines at answering non-answerable questions (ANLS=77.45%), which can partly explain the recent appeal of this particular GPT checkpoint. GPT3 (4-shot) also outperforms (ANLS=52.51%) the other tested baselines on questions from the 'complexity multi-hop' category, such as 'What city name appears the most often in the timetables?'.

Weaknesses. Compared to a simpler text-only generative baseline, T5-base-512 (ANLS=47%), LLMs perform two times worse on abstractive questions (ANLS=22%). Closer analysis reveals that LLMs (even with 4-shot prompting) predict abstractive questions to be non-answerable in 55% of cases (in reality: 10%). Operations such as arithmetic, counting, and comparisons remain generally elusive skills (<25% ANLS). Both LLMs we tested scored significantly lower than the human baseline on questions that require visual understanding, with an average ANLS score of 21%. This is understandable because they are text-only models. While the LLMs' zero-shot performance is relatively high, we note that DUDE consists of public-license documents from the web, which might potentially have been included in the LLMs' pretraining corpus.

5.6.2 Assessing Confidence

ECE measures the calibration of confidence, whereas AURC assesses both performance and confidence ranking [193] (see Section 2.2.3 for more detail).
The latter is therefore an appropriate metric for selecting the best model in real-world applications, where wrong predictions can yield undesired outcomes that could be prevented by manually revising low-confidence answers.

Interestingly, T5-base-512 scores better on calibration (ECE=10.82) than T5-2D-large-8K, the baseline with the highest ANLS yet worse calibration (ECE=14.4). In general, it seems calibration worsens when extending the maximum sequence length, whereas adding 2D position embeddings only positively affects ANLS. Of the baselines tested, T5-2D-large-8K achieves the best AURC.

Another interesting result comes from analyzing the calibration of models evaluated using the Concat strategy vs. the Max Conf. strategy. In the main paper, we reported results for the model with the relatively best ANLS. Thanks to our varied set of evaluation metrics, we discover that Max Conf. overall results in poor calibration (see Table 5.4), whereas considering ANLS, there is not always a clear winning strategy. This shows that predicting each page separately, thereby necessarily assuming conditional independence across pages, is not a reliable strategy for multipage DocVQA.

Model                       | ANLS  | ECE   | AURC
BertQA MPDocVQA Concat      | 29.8  | 13.83 | 43.28
BertQA MPDocVQA MaxConf     | 32.18 | 28.93 | 48.73
BigBird MPDocVQA Concat     | 30.67 | 25.07 | 47.2
BigBird MPDocVQA MaxConf    | 29.38 | 50.79 | 56.81
LayoutLMv3 MPDocVQA Concat  | 22.61 | 13.19 | 57.11
LayoutLMv3 MPDocVQA MaxConf | 25.27 | 31.31 | 58.54
Longformer MPDocVQA Concat  | 33.45 | 22.21 | 45.83
Longformer MPDocVQA MaxConf | 28.67 | 48.6  | 58.11
T5 MPDocVQA Concat          | 34.37 | 18.97 | 47.31
T5 MPDocVQA MaxConf         | 37.56 | 23.73 | 46.69
T5-base Concat-0            | 25.62 | 20.05 | 62.25
T5-base MaxConf-0           | 22.21 | 39.47 | 58.89

Table 5.4. Comparison of baselines using the Concat or Max Conf. strategies.

5.7 DUDE Competition Results

5.7.1 Submitted Methods

Overall, 6 methods from 3 different participants were submitted for the proposed tasks in the DUDE competition. To avoid cherry-picking from considering all submissions of individual participants, we consider only the last submission of each participant for the final ranking. All the methods followed an encoder-decoder architecture, which is a standard choice for VQA when abstractive questions are involved. Specifically, the submitted methods are mostly based on T5-base [383] as the decoder; for this reason, we include the T5-base baseline to compare how the participant methods improved on it. A short description of each method can be found in Table 5.5.

Two very recent state-of-the-art architectures, UDOP and HiVT5, were extensively leveraged by participants. The former is geared toward improved document page representations, while the latter targets multipage document representations. In their method reports, the UDOP-based models by Lenovo Research mention calculating confidence by multiplying the maximum softmax scores of the decoded output tokens, with two additional post-processing rules: (a) the confidence of predicted non-answerable questions is set to 1; (b) when abstaining, the confidence is set to 0.
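In sketch form, the reported confidence rules could look as follows (our paraphrase of the report; the 'none' string convention for non-answerable predictions is an assumption):

    import math

    def answer_confidence(decoded_tokens, token_probs, abstained: bool) -> float:
        """Sequence confidence as the product of per-token max softmax scores,
        with the two post-processing rules reported by Lenovo Research."""
        if abstained:
            return 0.0                       # rule (b): abstention
        answer = "".join(decoded_tokens).strip().lower()
        if answer == "none":                 # rule (a): predicted non-answerable
            return 1.0                       #   ('none' convention is hypothetical)
        return math.prod(token_probs)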
5.7.2 Performance Analysis

Table 5.6 reports the competition ranking, comparing the submitted methods' performance on the test set. Higher ANLS and AUROC values indicate better performance, while lower ECE and AURC values signify better calibration and confidence ranking. According to the findings, the UDOP+BLIP2+GPT approach attains the highest ANLS score (50.02), while also achieving the best calibration and OOD (out-of-distribution) detection performance. In a direct comparison of the MMT5 and HiVT5+modules methods, the former shows a higher ANLS score, yet did not provide any confidence estimates.
Thus, the overall winner is UDOP+BLIP2+GPT by Lenovo Research. Their submitted methods (ranked by highest ANLS) also differentiate themselves by their additional attention to confidence estimation. Based on the numbers in
\ No newline at end of file
diff --git a/assets/txts/pg_0172.txt b/assets/txts/pg_0172.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a5446cfd49c077afb1a0a5c04a1c74e6eab5c9
--- /dev/null
+++ b/assets/txts/pg_0172.txt
@@ -0,0 +1,68 @@
T5-base (ours): T5-base [383] fine-tuned on DUDE (AWS OCR), with a delimiter combining list answers into a single string, and not-answerable questions replaced with 'none'.

Lenovo Research
UDOP(M): Ensemble (M=10) of UDOP [443] models (794M parameters each) without self-supervised pretraining, only fine-tuned in two stages: 1) SP-DocVQA [450] and MP-DocVQA [451], and 2) DUDE (switching between Azure and AWS OCR).
UDOP+BLIP2: UDOP (M=1) with integrated BLIP2 [260] predictions to optimize the image encoder, plus additional page number features.
UDOP+BLIP2+GPT: UDOP (M=1) and the BLIP2 visual encoder with ChatGPT to generate Python-like modular programs that decompose questions for improved predictions [160, 437].

Upstage AI
MMT5: Multimodal T5 pretrained in two stages: single-page (ScienceQA [403], VQAonBD2023 [385], HotpotQA [508], SP-DocVQA) with two objectives (masked language modeling (MLM) and next sentence prediction (NSP)); multipage (MP-DocVQA and DUDE) with three objectives (MLM, NSP, page order matching). Fine-tuned on DUDE with per-page answers combined into the final output.

Infrrd.AI
HiVT5: Hi-VT5 [451] with 20 [PAGE] tokens, pretrained on a private document collection (no information provided) using a span masking objective [204]. Fine-tuned with MP-DocVQA and DUDE.
HiVT5+modules: Hi-VT5 extended with token/object embeddings for a variety of modular document understanding subtasks (detection: table structure, signatures, logo, stamp, checkbox; KIE: generic named entities; classification: font style).

Table 5.5. Short descriptions of the methods participating in the DUDE competition, in order of submission. The last submitted method is considered for the final ranking.

the table, several interesting observations can be made to support the suggested future directions and propose additional experiments:
• ANLS. The integration of UDOP, BLIP2, and ChatGPT contributes to the method's superior overall performance in answering different question types.
\ No newline at end of file
diff --git a/assets/txts/pg_0173.txt b/assets/txts/pg_0173.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e25724c2e19ddb4d937dc38656fe5300a72396c8
--- /dev/null
+++ b/assets/txts/pg_0173.txt
@@ -0,0 +1,99 @@
                 | Answer | Calibration   | OOD Detection | ANLS / answer type
Method           | ANLS ↑ | ECE ↓ | AURC ↓ | AUROC ↑      | Ex    | Abs   | Li    | NA
UDOP+BLIP2+GPT   | 50.02  | 22.40 | 42.10  | 87.44        | 51.86 | 48.32 | 28.22 | 62.04
MMT5             | 37.90  | 59.31 | 59.31  | 50.00        | 41.55 | 40.24 | 20.21 | 34.67
HiVT5+modules    | 35.59  | 28.03 | 46.03  | 51.24        | 30.95 | 35.15 | 11.76 | 52.50

Table 5.6. Summary of method performance on the DUDE test set. Average ANLS results per question/answer type are abbreviated as (Abs)tractive, (Ex)tractive, (N)ot-(A)nswerable, (Li)st. (*) All scalars are scaled between 0 and 100 for readability.

• ECE, AURC. Integrating UDOP, the BLIP2 visual encoder, and ChatGPT for question decomposition contributes to the method's performance in handling uncertainty across various question types.
• Abstractive. The top performance of UDOP+BLIP2+GPT on abstractive questions reveals the potential of combining the UDOP ensemble, BLIP2 visual encoder, and ChatGPT to enable abstract reasoning and synthesis of information beyond simple extraction.
• List. The performance of UDOP+BLIP2+GPT on list-based questions suggests that incorporating page number features can enhance the model's capability to process and generate list information, which might be spread across pages.
Figure 5.8 visualizes an overview of the performance of each submitted method with respect to diagnostic subset samples matching a certain diagnostic category. The models generally struggle with operations involving counting, arithmetic, normalization, and comparisons. As expected, models perform better when dealing with simpler questions (complexity simple) compared to more complex questions (complexity multi-hop, complexity other hard, and complexity meta). Models tend to perform better when handling evidence in the form of plain text (evidence plain) compared to other forms of evidence, such as visual charts, maps, or signatures. Performance across models is notably lower for tasks involving lists compared to other question types. Models show varying performance when dealing with different types of forms (e.g., date, numeric, other, proper).
Figure 5.10 studies the ability of the competitors' methods to answer questions on increasingly longer documents. We observe a significant drop in ANLS when aggregating scores over gradually longer documents. This is expected, as the longer the document is, the more probable it is that the answer will either be located on a later page or rely on a long-range dependency between the tokens (e.g., a multi-hop question). Strikingly, all methods' scores, except
\ No newline at end of file
diff --git a/assets/txts/pg_0174.txt b/assets/txts/pg_0174.txt
new file mode 100644
index 0000000000000000000000000000000000000000..621a5307ed350b1dc5a0502522daad0c6226f84c
--- /dev/null
+++ b/assets/txts/pg_0174.txt
@@ -0,0 +1,17 @@
Figure 5.8. We report the average ANLS per diagnostic category for each of the submitted methods vs. human and a baseline method T5-base. Since the diagnostic dataset contains a different number of samples per diagnostic category, we added error bars representing 95% confidence intervals. This helps visually determine statistically significant differences.
Hi-VT5+modules, drop significantly for questions on 2-page documents. This likely stems from the standard input size of T5-based methods, 512 tokens, which covers roughly one page.
\ No newline at end of file
diff --git a/assets/txts/pg_0175.txt b/assets/txts/pg_0175.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c28d5244f0d7619f8f5aff6de0ab6f39a895c4a3
--- /dev/null
+++ b/assets/txts/pg_0175.txt
@@ -0,0 +1,21 @@
Figure 5.9. A histogram (bins=8, matching the ANLS threshold of 0.5) of the average ANLS rate per QA pair when summing ANLS scores over competitor methods.

Figure 5.10. Left: A histogram of the number of questions relative to the number of pages in the document (limited to 20 pages). Right: A line plot of the average ANLS score per QA pair for documents of length at least (x-axis) pages.

Figure 5.9 analyzes the correlation of errors across competitor methods. A large portion of QA pairs is predicted completely wrong (ANLS rate = 0) by all competitor methods. This can have many plausible causes: a) since they all share a similar decoder (T5), the methods suffer from similar deficiencies; b) some QA pairs are too complex for current SOTA competitor methods, particularly questions requiring more complex reasoning or unique document-specific layout processing. To further analyze this phenomenon, we sample qualitative examples with different ANLS rates (Appendix B.1).
\ No newline at end of file
diff --git a/assets/txts/pg_0176.txt b/assets/txts/pg_0176.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8d940e92588d27690bd174b516b049ffafd860c3
--- /dev/null
+++ b/assets/txts/pg_0176.txt
@@ -0,0 +1,46 @@
5.8 Chapter Conclusion

In conclusion, this chapter introduces a new large-scale multipage, multi-domain, multi-industry Document Visual Question Answering benchmark for document understanding. Our dataset reflects the real-world environment, where we need to process long documents and understand diverse types of documents. The benchmark includes visual semantics such as tables, charts, figures, lists, checkboxes, stamps, and more, which are essential for real-world document understanding. The performance of SOTA textual and multimodal models still lags behind human performance, indicating the need for further improvement in the visual understanding of DU models. Nevertheless, we believe evaluating systems on DUDE could inspire new architectures and methods.
Limitations. As our approach is closer to real-world industrial applications, and enables models to recognize and understand new unseen data without the need for re-training, it does come with some limitations and constraining factors, including the use of only English-language documents. Future work could address these limitations and expand the benchmark to include other languages. Moreover, although our dataset can be considered large-scale, it still represents a relatively small sample of the plethora of documents that exist in the real world.
As a core contribution of DUDE, we wanted to emphasize the importance of evaluation beyond mere predictive performance. DUDE offers an interesting and varied test bed for the evaluation of novel calibration and selective QA approaches (e.g., [96, 273]). While this was not explicitly attempted in this iteration of the competition, we hope that future work will consider testing such methods against DUDE.
Future of the Shared Task As the competition evolves, we hope that DUDE will serve as an essential platform for pushing the frontiers of research and driving innovation in the DU field. Currently, our competition focuses on English-language documents, which means we miss out on the potential of incorporating multilingual data. An ideal extension for future iterations of the shared task would be to introduce multilingualism, which our framework can accommodate, provided that source documents are readily available. However, this would also require specifying language qualifications for annotation experts. Moreover, one could automate parts of the data collection and annotation processes by allowing the best-performing competition system to validate the aptitude and complexity of human-proposed QA pairs.
\ No newline at end of file
diff --git a/assets/txts/pg_0177.txt b/assets/txts/pg_0177.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ee6542242eacdb59b7f8540674d1ba69b87ff6d4
--- /dev/null
+++ b/assets/txts/pg_0177.txt
@@ -0,0 +1,3 @@
145
\ No newline at end of file
diff --git a/assets/txts/pg_0178.txt b/assets/txts/pg_0178.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b7242c23756073c1867d11150d90fa5a6a3a2f0b
--- /dev/null
+++ b/assets/txts/pg_0178.txt
@@ -0,0 +1,30 @@
Chapter 6

DistilDoc: Knowledge Distillation for Visually-Rich Document Applications

The contents of this chapter come from a publication under review at CVPR 2024 [471]:
Jordy Van Landeghem, Subhajit Maity, Ayan Banerjee, Matthew B Blaschko, Marie-Francine Moens, Josep Llados, and Sanket Biswas. DistilDoc: Knowledge Distillation for Visually-Rich Document Applications. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (under review), 2024
This is an external collaboration with Subhajit Maity, Ayan Banerjee, Josep Llados, and Sanket Biswas. The work was conceived during a research visit at the Computer Vision Center in Barcelona, Spain.
Disclosing the work done by the authors other than the supervisors:
• Jordy Van Landeghem created the project's scope, implemented and performed all DIC and downstream DocVQA experiments, including training the DLA teacher models and connecting the DLA inference and evaluation, and wrote the manuscript with supplementary material.
• Subhajit Maity and Ayan Banerjee built the DLA architectures and performed the DLA-KD experiments.
• Sanket Biswas brought the team together and helped with the related work and the introduction.
\ No newline at end of file
diff --git a/assets/txts/pg_0179.txt b/assets/txts/pg_0179.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a81fff9b08d66fd906bf382452650faff647f9a5
--- /dev/null
+++ b/assets/txts/pg_0179.txt
@@ -0,0 +1,46 @@
This chapter focuses on efficiency via knowledge distillation (KD) model compression for document understanding (DU) tasks. While DU research depends on increasingly sophisticated and cumbersome models, the field has neglected to study efficiency via model compression, referring to any technology that transforms large and complex models into smaller, streamlined models with similar performance [548].
Here, we design a KD experimentation methodology for leaner, performant models on DU tasks that are integral within larger task pipelines, specifically document image classification (DIC) and document layout analysis (DLA).
We carefully selected KD strategies (response-based, feature-based) for distilling knowledge to and from backbones with different architectures (ResNet, ViT, DiT) and capacities (base-small-tiny). We study what affects the teacher-student knowledge gap and find that some methods (tuned vanilla KD, MSE, SimKD with an apt projector) can consistently outperform supervised student training. Furthermore, we design a downstream task setup to evaluate the robustness of distilled DLA models on zero-shot layout-aware document visual question answering (DocVQA).
DLA-KD experiments result in a large mean average precision (mAP) knowledge gap, which translates unpredictably to downstream robustness, accentuating the need to further explore how to efficiently obtain more semantic document layout awareness.
This chapter motivates the need for more efficient DU models, especially for VRD tasks, and provides a benchmarking framework for future research on KD for DU tasks. Additionally, it motivates being smart about when to use which modality, as the downstream task can have a certain modality bias (e.g., DocVQA is a text-centric task, whereas DLA is more vision-centric). Finally, it links to the efforts in DUDE to use LLMs for DU, with the focus here on incorporating layout information from distilled DLA models into the LLMs.

6.1 Introduction

Visually-rich Document Understanding (DU) has attracted increasing interest over the last few years. It involves multiple tasks such as document image classification (DIC) [165, 195, 210, 284], key information extraction (KIE) [197, 272, 296, 422, 433], document layout analysis (DLA) [35, 36, 80, 362, 544] and document visual question answering (VQA) [100, 309, 310, 450]. Current SOTA DU models [153, 187] solve the task by using modern OCR engines to read the text and then combining the extracted text with spatial features to predict the page layout and structure. However, these multimodal architectures come with the following
\ No newline at end of file
diff --git a/assets/txts/pg_0180.txt b/assets/txts/pg_0180.txt
new file mode 100644
index 0000000000000000000000000000000000000000..86bb331e648b07690b7a1ba97f9f80d78a6eb19c
--- /dev/null
+++ b/assets/txts/pg_0180.txt
@@ -0,0 +1,112 @@
[Figure 6.1: schematic comparing a large baseline encoder consuming plain OCR text tags with a small, distilled student encoder whose layout-aware OCR prompt is enriched with DLA tags, both feeding an LLM decoder for DocVQA (example question: "what are the contents in page 2?"); the KD variants are assessed as practical (mAP, parameter size, GFLOPS), efficient (im/s throughput), and robust (ANLS, explorative analysis).]

Figure 6.1. DistilDoc presents the first framework to investigate the potential of KD-based DLA model compression to enrich LLM prompts with logical layout structure to practically and efficiently improve downstream applications such as DocVQA.

drawbacks: 1) they rely primarily on Large Language Models (LLMs) [542] pretrained on millions of samples, which depend more on OCR text quality than on visual features/document structure; 2) they can be computationally heavier due to the need to process and fuse information from different modalities; and 3) they may perform poorly in domains with poor OCR results or on low-resource languages. Therefore, this work focuses on single-modality, vision-only architectures that can be fine-tuned for handling VRDs in tasks involving the understanding of visual-layout semantics such as tables, titles, paragraphs, figures, etc. DLA is a useful preliminary step in a document processing workflow [35, 80], holding the key to enhancing practical downstream DU tasks such as DIC, KIE, and VQA. DLA can impart logical layout structure, beyond the geometric layout from OCR [164], and structured context to the document, enabling more accurate content extraction and interpretation. A recent DU competition [469] has pleaded to bridge the gap between DLA and DocVQA by introducing layout-navigating or multi-region questions.
To handle the computational demand of modality/task-specific models, knowledge distillation (KD) [21, 150, 178, 394] can prove an effective approach to obtain efficient modules for later re-use in enriching LLM document inputs. Teacher model compression has the potential to produce student models that improve over direct fine-tuning, also making them practical for deployment on resource-constrained devices or for faster real-time inference. The field of Document AI [79] is engaged with representing and understanding VRDs, but has thus far not explored KD-based model compression for improved efficiency
\ No newline at end of file
diff --git a/assets/txts/pg_0181.txt b/assets/txts/pg_0181.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba6dd475cbfba13a5d1a8a74d2ec3fe0e05458b0
--- /dev/null
+++ b/assets/txts/pg_0181.txt
@@ -0,0 +1,43 @@
and uncertainty estimation [126].
This work investigates the potential of enriching VRDs with logical layout structure derived from effective DLA model compression using KD methods to practically and efficiently improve downstream DU applications. The nature of the (document) dataset has a major impact on the KD process [434], which required motivated choices (regarding dataset usage [14, 165, 362], architectures, weight initialization [259], KD methods [63, 67, 170, 178, 183, 540], evaluation, downstream procedure [482], etc.) in designing our experimental methodology of KD benchmarking for DU tasks (DIC, DLA). This allows us to investigate aspects affecting teacher-student knowledge/capacity/initialization gaps.
The key contributions of the paper are twofold:
I.
We are the first to design, apply, and open-source an experimental methodology for comprehensively benchmarking KD-based model compression on DU tasks involving VRDs (DIC and DLA).
II. We design a novel evaluation procedure based on the downstream task of zero-shot layout-aware DocVQA to quantify the robustness of distilled DLA models.
Nevertheless, our contributions go beyond mere KD-based compression benchmarking, promoting logical layout analysis over geometric layout to enhance the generalization of DU models toward unseen documents with diverse and complex layouts, as demonstrated in Figure 6.1.

6.2 Related Work

Efficiency and Model Compression Efficiency through model compression is gaining relevance with the increasing parameter size and complexity of models such as LLMs [556]. Although KD is a prominent technique for model compression, several alternative approaches are worth mentioning. Quantization has recently been re-discovered in the context of LLMs with LoRA [184] and QLoRA [93], which achieve substantial model compression with minimal accuracy degradation. Advances have also been made in vision-and-language [57, 518] and, more recently, for vision transformer (ViT) training [269]. However, the effectiveness of quantization also depends on some key factors, including the model architecture, data type, bit-width, and the training recipes employed. In this direction, neural architecture search (NAS) has become an important field of study [55, 279, 280, 363]. Popular alternatives include model weight pruning [131, 288, 554], which benefits
\ No newline at end of file
diff --git a/assets/txts/pg_0182.txt b/assets/txts/pg_0182.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e5c951597058d2fedfbf11acd36b1d9fd49f9846
--- /dev/null
+++ b/assets/txts/pg_0182.txt
@@ -0,0 +1,43 @@
strongly from joint usage with other efficiency and model compression techniques, and adaptive inference with multi-exit architectures [501, 547], which is promising yet highly dependent on early-exit network design and uncertainty estimation. KD-based training [364] complements the aforementioned techniques, potentially leading to more accurate model exits and pruning. Moreover, KD strategies involve overall simpler design choices, depending mostly on the availability of a large teacher model trained on the domain data of interest. Therefore, we prioritize KD-based model compression and efficiency for practical DU applications.
Knowledge Distillation KD strategies can be grouped into three main categories: response-based KD [6, 21, 178, 314, 509, 541], which seeks to match the final-layer predictions of the teacher model; feature-based KD [8, 62, 67, 175, 221, 394], which aims to mimic features extracted from intermediate hidden layers of the deep network; and relation-based KD [355, 356, 447, 511], which exploits the relations between different layers or sampled data points. The latter approach, however, is more geared toward pixel-based semantic segmentation tasks. While feature-based KD is more versatile, it is more expensive and harder to implement than distilling soft teacher predictions. While offline methods [178, 394] consider an existing frozen teacher model, online methods [61, 538] update both student and teacher networks jointly.
Self-distillation [22, 528] represents a special case of online KD, which employs the same network as both the teacher and the student, progressively improving over the network's own performance, albeit disregarding the aim of efficiency.
Our work's scope is offline KD schemes, with a single converged teacher (vs. intermediate checkpoints [479] or ensembles [515]), single-modality inputs (vision only), and three different feature-extraction backbones (ResNets, ViT, and a self-supervised pretrained document foundation model, DiT [259]). Our study seeks to extend the empirical utility of KD to popular DU tasks (DIC & DLA) with a versatile benchmarking framework to ensure future compatibility, fostering KD-based DU model compression research.
Practical and Efficient Document Understanding Recent efforts to represent layout and document structure have gained substantial recognition, particularly with the incorporation of structural information into LLMs. The LayoutLM family [187, 502, 503] and GeoLayoutLM [296] laid the foundation of using the 2D positional information of text (word-block) tokens obtained from OCR as a geometric layout representation for the input. Recent work [416] has further enhanced this 2D representation by incorporating text lines or text blocks as layout groups inside the OCR text tokens. [482] further experiment with structure-preserving OCR, which uses appropriate spaces and line breaks as an
\ No newline at end of file
diff --git a/assets/txts/pg_0183.txt b/assets/txts/pg_0183.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b0a7e906b7cb71d7f5e4d0757892cd7908f88ea
--- /dev/null
+++ b/assets/txts/pg_0183.txt
@@ -0,0 +1,113 @@
[Figure 6.2: schematic of the experimental pipeline, left to right: pre-trained checkpoints (ViT-B, ResNet-101, DiT-B, ResNet-50, ViT-S/T; optional) obtained via supervised ImageNet-1K (1.3M), supervised RVL-CDIP (400K), self-supervised IIT-CDIP (11M), or supervised PubLayNet (360K) pretraining; teacher fine-tuning and student fine-tuning (random or pretrained initialization) on Tobacco-3482, RVL-CDIP_[1K], PRImA, and DocLayNet; response-based and feature-based KD methods; downstream evaluation on RVL-CDIP-N, DocVQA, and InfographicsVQA, with an example question "How many positive samples of Influenza A H1 pathogen were detected in DoD beneficiary?".]

Figure 6.2. Proposed experimental methodology to comprehensively study all aspects (left-to-right) that impact KD methods (response, feature; projectors) adapted for VDU task specifics (architecture, weight initialization, pretraining & finetuning datasets, student capacity). Downstream setups evaluate the robustness of distilled students.

LLM input, thereby improving the ability to capture layout and structural cues for zero-shot DocVQA [309, 310] tasks. [153, 263] seek to represent layout as region-level proposal features, representing logical layout elements (like titles, paragraphs, figures, tables, etc.) as in the DLA task. To further study the utility of logical layout representations, [498] address asking questions conditioned on a specific region of a page, improving upon the design of DocVQA, which provides too many in-line questions (>80%). More recently, PDFTriage [400] generates a structured metadata representation of born-digital documents, extracting both geometric and logical layout elements like section text, figure captions, headers, and tables for a more precise QA approach.
DUDE [468] offers a testing bed for DocVQA on multipage, multi-type documents with varying layouts, including questions conditioned on layout navigation, e.g., 'Which pages have tables?'.
Our explorations focus on making the most of the logical layout features obtained from the multi-domain DLA benchmark DocLayNet [362]. We build upon the aforementioned advancements and explore how incorporating document structure can enhance the performance of downstream task models, aligning with the trend of enriching LLMs with rich-text prompting and layout-aware representations.

6.3 Experimental Setup

This section documents the experimental methodology established in this work, as visualized in Figure 6.2, including datasets, architectures and backbones for teacher and student models, KD methods, and evaluation metrics for the
\ No newline at end of file
diff --git a/assets/txts/pg_0184.txt b/assets/txts/pg_0184.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a62d9d15a7b71612b93b6321ec96548be236f205
--- /dev/null
+++ b/assets/txts/pg_0184.txt
@@ -0,0 +1,91 @@
tasks and distillation effectiveness. The goal is to provide a framework for future research on KD for DU tasks and to allow pinpoint comparisons on KD aspects such as the teacher-student knowledge and capacity gap, teacher pretraining, student network initialization, etc.

Table 6.1. Dataset usage for DIC, DLA, and downstream tasks. Symbols: P = pretraining, DP = document pretraining, T = teacher training, S = student training, * = subsampling, E = teacher/student evaluation, D = downstream evaluation.

Dataset            | Task | Usage   | Size  | # Cls
ImageNet [90]      | DIC  | P       | 1.28M | 1000
IIT-CDIP [252]     | DIC  | DP,T,S  | 11M   | /
Tobacco-3482 [232] | DIC  | T,S,E   | 3482  | 10
RVL-CDIP [165]     | DIC  | DP,T,E  | 400K  | 16
PRImA [14]         | DLA  | T,S,E   | 400   | 6
DocLayNet [362]    | DLA  | T,S,E   | 80.8K | 11
RVL-CDIP-N [241]   | DIC  | D       | 1K    | 12
SP-DocVQA [450]    | VQA  | D       | 12.8K | 50K
Infographic [310]  | VQA  | D       | 5.5K  | 30K

6.3.1 Datasets

Tab. 6.1 lists all datasets used (in)directly for the experiments. As there is no existing methodology for KD experimentation on the tasks involved, we motivate the design choices:
DIC We benchmark results on both Tobacco-3482 (original train-val-test splits of 800-200-2482) and RVL-CDIP. The originally large training-set size of RVL-CDIP hinders experimentation (long iteration cycles), which is why we create a subsampled student training set, RVL-CDIP_1k, by randomly selecting 1K images per class. By evaluating on the full RVL-CDIP test set, we provide a fair evaluation of the usefulness of KD methods while avoiding the cumbersomeness of student fine-tuning on such a large dataset.
While RVL-CDIP is the de facto standard for measuring performance on the task of document classification, the literature [242, 470] has reported several undesirable characteristics, such as (near-)duplicates causing substantial overlap between the train and test distributions. We complement independently and identically distributed (i.i.d.) test-set evaluation with benchmarking on RVL-CDIP-N [241], a covariate-shift dataset that allows us to evaluate the robustness of KD methods to domain shift, which is a common problem in real-world applications.
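The subsampling itself is a one-off preprocessing step; a minimal sketch of how RVL-CDIP_1k can be constructed (the (path, label) sample format and the fixed seed are our own assumptions, not the released preparation code):

import random
from collections import defaultdict

def subsample_per_class(samples, per_class=1000, seed=42):
    # samples: iterable of (image_path, label) pairs covering all 16 classes
    by_label = defaultdict(list)
    for path, label in samples:
        by_label[label].append((path, label))
    rng = random.Random(seed)
    subset = []
    for label in sorted(by_label):
        items = by_label[label]
        rng.shuffle(items)
        subset.extend(items[:per_class])  # keep 1K images per class
    return subset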
\ No newline at end of file
diff --git a/assets/txts/pg_0185.txt b/assets/txts/pg_0185.txt
new file mode 100644
index 0000000000000000000000000000000000000000..caeb6bf1cf13e65f1d52ae46fde5ee122bf6ae73
--- /dev/null
+++ b/assets/txts/pg_0185.txt
@@ -0,0 +1,41 @@
DLA We benchmark results on DocLayNet (reporting evaluation on the validation set, following common practice) and PRImA. The former is a large-scale human-annotated dataset with 81K images and 11 categories of logical layout elements, while the latter is a smaller dataset with 400 images and 6 classes. DocLayNet contains wide layout variability with six diverse document types (patents, manuals, scientific, legal, reports, tenders) in English. They have been hand-annotated by trained experts, making it the gold standard for DLA. Alternatively, the PubLayNet [544] or MS-COCO [274] benchmarks have been used for pretraining DLA models. However, the former lacks diversity, as it only contains documents from the scientific domain, while the latter is a more common object detection benchmark for natural scenes.
We consider a mirrored data setup for both tasks, with one larger benchmark dataset (RVL-CDIP, DocLayNet) and a smaller, easier dataset (Tobacco-3482, PRImA). This allows us to compare KD efficacy with more or less accurate teachers across tasks.

6.3.2 Architectures and Backbones

We evaluated three backbone architectures, representing different approaches to the tasks of DIC and DLA.
Backbones Residual Network (ResNet) [167]: a supervised pretrained CNN-based architecture that is a staple in image recognition.
Vision Transformer (ViT) [101]: a supervised pretrained Transformer-based architecture that is effective for a variety of CV tasks.
Document Image Transformer (DiT) [259]: a self-supervised pretrained architecture specifically designed for DU tasks, pretrained on 11M document images from IIT-CDIP with a Masked Image Modeling objective, as inspired by BEiT [24].
Specific to DLA, we use the Mask R-CNN [168] meta-architecture for instance segmentation with two different backbones: i) classic ResNets and ii) ViT, with the latter more challenging to integrate [267].
Historically, CNNs have been more popular for DLA due to their accuracy, speed, and the multiple optimizations built into the meta-architectures (involving a backbone, neck, and head). However, recent work points to the potential of ViTs as plain (non-hierarchical) object detectors [268]. Compared
\ No newline at end of file
diff --git a/assets/txts/pg_0186.txt b/assets/txts/pg_0186.txt
new file mode 100644
index 0000000000000000000000000000000000000000..956410aa0607682ada75cf2fdb3741bf8b5945d2
--- /dev/null
+++ b/assets/txts/pg_0186.txt
@@ -0,0 +1,37 @@
to Transformers, CNNs have strong inductive biases of translation equivariance and locality, a fundamental difference that is less explored in a KD context [33].
Network Architecture and Initialization Document images are very different from natural images, yet most available vision backbones of different sizes are pretrained on the latter, except for DiT. Nevertheless, ViTs seem to struggle to learn a function when starting from random initialization, both as teacher and student networks. Therefore, we use ImageNet-pretrained checkpoints for all models considered, even for student network initialization.
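In practice, this initialization amounts to loading off-the-shelf ImageNet checkpoints for every backbone; a sketch using the timm library (the model identifiers are illustrative of the variants used, not our released training code, and DiT checkpoints come from its own release rather than timm):

import timm

NUM_CLASSES = 16  # RVL-CDIP

teacher = timm.create_model("vit_base_patch16_224", pretrained=True,
                            num_classes=NUM_CLASSES)
student = timm.create_model("vit_tiny_patch16_224", pretrained=True,
                            num_classes=NUM_CLASSES)
# CNN pair with closely matching hidden output dimensionality
cnn_teacher = timm.create_model("resnet101", pretrained=True,
                                num_classes=NUM_CLASSES)
cnn_student = timm.create_model("resnet50", pretrained=True,
                                num_classes=NUM_CLASSES)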
Teacher Models While there are many model variants with different capacities for each of the backbones (Tab. D.1), we opt for the Base variant for Transformers, which arguably is the most common. We consider ResNet-101, as it has the attractive property of having a similar hidden-layer output dimensionality as the next smaller variant, ResNet-50.
The comparison of ViT-B and DiT-B allows us to evaluate the effects of different pretraining schemes (supervised, self-supervised) and how this affects knowledge transfer.
Student Models For DIC, we consider ViT-small and ViT-tiny, as well as a CNN-based architecture (ResNet-50), whereas for DLA, we consider Mask R-CNN with a ResNet-50 backbone and a ViT-tiny backbone. Due to the computational demand of training instance segmentation models, we only consider the ViT-tiny backbone for the student model, making it impossible to analyze KD methods for an increasing teacher-student capacity gap. While it would have made an interesting comparison, DiT has not been released in a variant smaller than DiT-B, and given the computational demand of pretraining DiT on the entire IIT-CDIP dataset containing 42 million document images, we did not consider it for student training. One might regard the knowledge transfer of DiT-B to a smaller ViT-(S/T) as potentially resulting in DiT-(S/T), yet the ImageNet or random initialization of the student network differs substantially from the self-supervised DiT weight space.
\ No newline at end of file
diff --git a/assets/txts/pg_0187.txt b/assets/txts/pg_0187.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d8ea714883c7560663d12dfe67dcacdf97acd338
--- /dev/null
+++ b/assets/txts/pg_0187.txt
@@ -0,0 +1,57 @@
6.3.3 KD Methods

The basic approach of knowledge distillation consists of transferring 'knowledge' from a cumbersome teacher model $f^t$ to a lightweight student model $f^s$, where $f : \mathcal{X} \to \Delta^{\mathcal{Y}}$ is a function mapping input data $\mathcal{X}$ to a conditional probability distribution $P(y' \mid x)$ over output labels $y' \in \mathcal{Y} = [K]$ for $K$ classes [368]. When this model compression approach is done effectively, the student model will be more efficient in terms of memory and computation. The top-1 class prediction is $\hat{y} = \operatorname{argmax}_{y' \in \mathcal{Y}} [f(x)]_{y'}$, with $\hat{p} = \max_{y'} [f(x)]_{y'}$ the posterior probability. For convenience, $[\tilde{f}(x)]_k$ denotes the $k$-th element of the logits vector $\tilde{f}(x) \in \mathbb{R}^K$, which is normalized with the temperature-scaled softmax

$f(x) = \sigma\big(\tilde{f}(x)\big) = \frac{\exp(\tilde{f}(x)/\tau)}{\sum_{k=1}^{K} \exp([\tilde{f}(x)]_k/\tau)}.$

Let each function $f$ be parameterized by $\theta$, holding all trainable parameters of the function, separable into $L$ layers, where $f_l(x)$ denotes the $l$-th layer output, e.g., the penultimate layer output $f_{L-1}(x)$.
While there exists a wealth of ever-growing KD methods, we have carefully chosen a combination of: simple methods mimicking the basic principles of KD (i, iv); more advanced KD methods that target specific improvements, such as penalizing the non-target class logits (iii) or distilling the knowledge of intermediate layers (v); and methods that take a step back from established KD practices by optimizing the mean squared error (MSE) between teacher-student logits or reusing the teacher classifier (ii, vi).
Every method is explained with its loss function, additional hyperparameters, and training parameters.
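In code, the temperature-scaled softmax and the derived top-1 prediction and confidence read as follows (a NumPy sketch mirroring the notation above, not a benchmark implementation):

import numpy as np

def softmax_t(logits, tau=1.0):
    # temperature-scaled softmax over a logit vector in R^K
    z = np.exp((logits - logits.max()) / tau)  # max-shift for numerical stability
    return z / z.sum()

logits = np.array([2.0, 0.5, -1.0])
probs = softmax_t(logits, tau=2.5)  # tau > 1 softens the distribution
y_hat = int(np.argmax(probs))       # top-1 class prediction
p_hat = float(probs.max())          # posterior probability (confidence)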
(i) Vanilla KD [178] optimizes a linear combination of the hard-target student cross-entropy (CE) loss and a Kullback-Leibler (KL) divergence loss with the soft-target teacher predictions, with KD loss hyperparameters $\alpha \in [0, 1]$ and temperature $\tau > 1$, which respectively weight the student loss and control the softness of the teacher logits.

$\mathcal{L}_{\mathrm{KD}} = \alpha \, \underbrace{\mathcal{L}_{\mathrm{CE}}(y, \hat{y}^s)}_{\tau = 1} + (1 - \alpha) \, \tau^2 \, \underbrace{\mathcal{L}_{\mathrm{KL}}\big(f^t(x), f^s(x)\big)}_{\tau > 1}$

(ii) MSE loss between the teacher-student logit vectors enables direct logit-level matching [217]:

$\mathcal{L}_{\mathrm{MSE}} = \big\lVert \tilde{f}^s(x) - \tilde{f}^t(x) \big\rVert_2^2$

(iii) NKD The normalized KD loss [509] decouples vanilla KD into a normalized (indicated by $\mathcal{N}$) combination of the target ($c \in \mathcal{Y}$) loss and the non-target loss in CE form, where $\gamma \in [0, 1]$ is a trade-off hyperparameter and $\tau$ the temperature:
\ No newline at end of file
diff --git a/assets/txts/pg_0188.txt b/assets/txts/pg_0188.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9ce9771000b4b065b854e5374eda394a4b3b7484
--- /dev/null
+++ b/assets/txts/pg_0188.txt
@@ -0,0 +1,73 @@
$\mathcal{L}_{\mathrm{NKD}} = \underbrace{- [f^t(x)]_c \log\big([f^s(x)]_c\big)}_{\text{target}} - \gamma \cdot \tau^2 \cdot \underbrace{\sum_{k \neq c}^{K} \mathcal{N}\big([f^t(x)]_k^{\tau}\big) \log\Big(\mathcal{N}\big([f^s(x)]_k^{\tau}\big)\Big)}_{\text{non-target}}$

(iv) FitNet [394] enables feature-based KD by minimizing the Euclidean distance between the intermediate feature maps of the teacher and student networks (i.e., an MSE loss). A trainable projector $\mathcal{P}(\cdot)$ (e.g., a linear projection layer) is required if the dimensionality of the hint layer(s) $h \in [1, L+1]$ outputs does not correspond to that of the student. There are no hyperparameters, except for the projector design and where to place the hint layers in the teacher network.
(v) ReviewKD [67] uses multi-stage information (multiple layers) of the teacher to supervise one student layer. The knowledge review mechanism is too complex to cover here, as it involves multiple modules (residual learning, an attention-based fusion projector, and a hierarchical context loss). This work claimed the first exploration of KD for instance segmentation, which is why we include it only for DLA.
(vi) SimKD [63] is a hybrid KD method that combines the advantages of response-based and feature-based KD. On the one hand, it reuses the pretrained, frozen teacher classifier for student inference ($f^t_L(\mathcal{P}(f^s_{L-1}(x)))$), and on the other hand, it adopts MSE for feature alignment (following a projector) of the penultimate-layer feature representations. Note that the former classification output is not used for training or loss calculation, only the latter projected feature-map alignment:

$\mathcal{L}_{\mathrm{SimKD}} = \mathcal{L}_{\mathrm{MSE}}\big(f^t_{L-1}(x), \, \mathcal{P}(f^s_{L-1}(x))\big)$

While the projector can safely be discarded for (iv, v) to obtain cost-free student inference, SimKD requires both the trained projector and the teacher classifier to be used (and stored) for student inference. SimKD originally proposed a CNN-based projector between the teacher and student feature maps (assuming C(hannels) x H(eight) x W(idth) inputs). For compatibility with ViT-based architectures, we contribute a novel variant of SimKD, which uses a linear projection layer on the [CLS] token at the penultimate layer. Alternatively, we draw upon [77, Theorem 1], stating that a multi-head self-attention layer can simulate a convolutional layer: we reshape the penultimate hidden-layer output (dropping the [CLS] token from the, e.g., 197 tokens of ViT-B) to (C x W x H), where C is the hidden size and W, H are equal to the number of patches per side (e.g., 14 for ViT-B with patch size 16 and 224x224 images), and finally apply the original CNN projector to obtain the projected feature maps.
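To make the response-based objectives concrete, a compact PyTorch sketch of (i) and (ii) is given below (hyperparameter defaults follow the settings quoted in Tab. 6.6; the code is our own illustration, not the released training script):

import torch.nn.functional as F

def vanilla_kd_loss(student_logits, teacher_logits, targets, tau=2.5, alpha=0.5):
    # (i) hard-target CE (tau = 1) plus tau^2-scaled KL to soft teacher targets
    ce = F.cross_entropy(student_logits, targets)
    kl = F.kl_div(
        F.log_softmax(student_logits / tau, dim=-1),
        F.softmax(teacher_logits / tau, dim=-1),
        reduction="batchmean",
    )
    return alpha * ce + (1.0 - alpha) * (tau ** 2) * kl

def mse_logit_loss(student_logits, teacher_logits):
    # (ii) direct logit-level matching between student and teacher
    return F.mse_loss(student_logits, teacher_logits)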
\ No newline at end of file
diff --git a/assets/txts/pg_0189.txt b/assets/txts/pg_0189.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d17b75a0ce94e99ecc7f80852bd803704cf2aab3
--- /dev/null
+++ b/assets/txts/pg_0189.txt
@@ -0,0 +1,41 @@
Task considerations The number of KD methods considered differs between the tasks, as some methods were not designed for use in a meta-architecture like Mask R-CNN. Response-based methods using logits are not capable of providing knowledge for object localization (e.g., the region proposal network head), making feature mimicking of vital importance. Moreover, the performance of instance segmentation highly depends on the quality of deep features to locate the objects of interest [509, 541], which is why we only consider feature-based KD methods for DLA (v, vi). When deciding upon the KD methods to include, the literature reported ReviewKD as the feature-based SOTA, NKD as the response-based SOTA, and SimKD as the hybrid SOTA on image classification (CIFAR-100).

6.3.4 Evaluation

Metrics Predictive performance evaluation for DIC follows standard practice with accuracy, whereas we forego the F1 score as the classes are balanced. For DLA, we use the standard metric of mean average precision (mAP) @ intersection over union (IoU) [0.50:0.95] of bounding boxes.
Efficiency evaluation considers the combination of parameter size and FLOPS (floating-point operations) to be representative enough to compare distilled models.
Following calls in the DU literature [468] to establish calibration and confidence ranking as defaults in the evaluation methodology, we include the Expected Calibration Error (ECE) [156, 332, 340] to evaluate top-1 prediction miscalibration and the Area Under the Risk-Coverage Curve (AURC) [138, 193] to measure the error rate over selective (% of test set) accuracy (detailed in Section 2.2.3).
Covariate shift DIC-KD evaluation To evaluate the robustness of distilled models, we consider evaluating the impact of domain shift on the downstream task of DIC. Luckily, there exists a dataset similar to RVL-CDIP in terms of document types and classes, yet different in terms of document sources and label distribution. This dataset is called RVL-CDIP-N [241], and we will use it to evaluate the robustness of distilled models.
\ No newline at end of file
diff --git a/assets/txts/pg_0190.txt b/assets/txts/pg_0190.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c962fe1325d83916ac632dce15e3dcd29386134a
--- /dev/null
+++ b/assets/txts/pg_0190.txt
@@ -0,0 +1,46 @@
6.3.5 DLA-enriched LLM prompting

Downstream DLA-KD evaluation An important objective of this work is to demonstrate the usefulness of DLA predictions in downstream VRD tasks.
As SOTA DLA models are often as cumbersome (in parameter size, GFLOPS) as the downstream models, this motivates the need for KD to obtain more efficient DLA predictors that can be used to enrich document inputs with logical layout information.
While we focus on vision-only document inputs in benchmarking KD, we take the opportunity to benchmark DLA as part of a zero-shot DocVQA task setup with text-only LLMs [482], which can benefit from additional layout information when answering questions about content that appears in certain logical elements ('what is the first column header of Table 3', 'what is the title of the document?'). Similarly, it could benefit from knowing what falls within an infographic picture or legend, which is why we benchmark on SP-DocVQA and InfographicVQA, with the latter containing more visually-rich information. As the model of choice, we have opted for Llama-2-7b-chat [452] with 4-bit quantization to keep GPU memory requirements to a minimum, while still performing sufficiently reliably. Evaluation is done using ANLS [39, 468] on predicted answers vs. ground truths.
The prompt design follows [482], with a task instruction and placeholders for the question and the document input, the latter depending on the prompt parameterization (see Tab. 6.2). Possible values are plain (single-spaced OCR tokens), space (tokens placed heuristically with whitespace in their approximate position), or DLA, which adds start and end tags such as <table> and </table> to indicate the logical layout as predicted by a DLA model. A pseudo-algorithm (Sec. 6.3.5) details the procedure to generate DLA-enriched prompts.
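The three parameterizations can be illustrated on a toy page before giving the full procedure (the tag names follow the DocLayNet label set; the snippet is a simplified illustration, not the algorithm below):

# OCR tokens of a toy page, in reading order
tokens = ["Annual", "Report", "Revenue", "2022", "1.2M"]

plain = " ".join(tokens)
# 'space' would instead pad tokens with whitespace to their approximate
# x/y position (structure-preserving OCR of [482]).
# 'DLA' wraps the token spans with predicted logical-layout tags:
dla = "<title> Annual Report </title> <table> Revenue 2022 1.2M </table>"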
KIE is regarded as an important downstream DU task, yet we believe (as supported by [166]) that it would benefit less from DLA, since most of its information is organized as key-value pairs with only local context relevance.

6.4 Results & Discussion

DLA-KD This work investigates different SOTA KD methods and integrates them into the DLA framework with ResNet and ViT feature-extraction backbones. KD in DLA poses significant challenges owing to the intricate
\ No newline at end of file
diff --git a/assets/txts/pg_0191.txt b/assets/txts/pg_0191.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3f6d8e03178f21a731941d9bcceaf77bac0fac02
--- /dev/null
+++ b/assets/txts/pg_0191.txt
@@ -0,0 +1,156 @@
Algorithm 1: Construction of DLA-enriched prompts p_DLA

Input: A finite set D_test = {(x^(i), y^(i))}_{i=1}^N of holdout data, consisting of document images x^(i) and corresponding labels y^(i); a document image v
Output: Tokenized DLA-enriched prompts p_DLA
Parameters: zeta_iou: IoU threshold for layout-token boxes (default: 0.3); Ignore-labels: DLA labels to ignore for enrichment (default: {'Text'})
Require: A trained DLA model and an OCR engine

(1) Feed the image to the DLA model to obtain labeled layout boxes:
  {(b_j, c_j, m_j)}_{j=1}^J <- DLA(v)    // boxes, classes, metadata
Feed the image to the OCR engine to obtain tokens and token-boxes:
  u = {(w_t)}_{t=1}^T, s = {(x_t^1, y_t^1, x_t^2, y_t^2)}_{t=1}^T <- OCR(v')
Standardize the layout boxes to the same xy-format:
for j <- 1 to J do
  b_j <- StandardizeBbox(b_j)    // standardize to xy-format
  if OCR image dims != DLA image dims then
    // precomputed OCR (DUE) results can be reused, yet OCR images can have a higher resolution
    b_j <- InterpolateBbox(b_j, v, v')    // interpolate the layout box to the OCR image size

(2) Find the closest start and end token-boxes (input: DLA predictions DLA(v), OCR tokens u, OCR token-boxes s; output: an updated set of OCR tokens u^ and token-boxes s^):
for j <- 1 to J do
  if c_j in Ignore-labels then continue
  S <- (0, inf); E <- (-1, inf)    // initialize start and end with dummy index and distance values
  for t <- 1 to T do
    // multiple relaxing heuristics to find the closest token-box to the layout box:
    // keep token-boxes fully contained within the layout box or with IoU > threshold
    if not (FullyContains(b_j, s_t) or IntersectionOverUnion(b_j, s_t) > zeta_iou) then continue
    S <- min(S, (t, Laplacian(b_j, s_t)))    // minimal Laplacian distance to the top-left corner point
    E <- min(E, (t, Laplacian(b_j, s_t)))    // minimal Laplacian distance to the bottom-right corner point

(3) Insert the DLA labels before and after the closest tokens:
C <- 0    // token insertion counter
u^, s^ <- u, s    // initialize the to-be-updated OCR tokens and token-boxes
I <- SortAndLabel(S, E)    // sort start and end tokens together by index and add the label type
for j <- 1 to |I| do
  if I_j is a start token then
    u^ <- insert start label at I_j + C    // insert a label such as <table> before the token
    s^ <- insert b_j at I_j + C
    C <- C + 1
  if I_j is an end token then
    u^ <- insert end label at I_j + C + 1    // insert a label such as </table> at the next token
    s^ <- insert b_j at I_j + C + 1
    C <- C + 1
return u^, s^    // tokens and token-boxes with DLA labels, to be used in the prompt design of [482]
\ No newline at end of file
diff --git a/assets/txts/pg_0192.txt b/assets/txts/pg_0192.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cbb84112ea0b2faa6429cddc3e238a65e1221071
--- /dev/null
+++ b/assets/txts/pg_0192.txt
@@ -0,0 +1,280 @@
Table 6.2. Prompt design following [482], with placeholders depending on the parameterization of the document input (plain, space, DLA).

You are asked to answer questions asked on a document image.
The answers to questions are short text spans taken verbatim from the document.
This means that the answers comprise a set of contiguous text tokens present in the document.
Document:
{Layout Aware Document placeholder}
Question: {Question placeholder}
Directly extract the answer to the question from the document with as few words as possible.
Answer: {}

Table 6.3. Results for KD methods applied on DocLayNet [362].

Teacher | Student | Method     | mAP↑  | Flops↓ | Params↓ | Im/s↑
ViT-B   | -       | Supervised | 65.65 | 107G   | 114M    | 20
R101    | -       | Supervised | 73.56 | 60G    | 63M     | 12
-       | ViT-T   | Supervised | 62.85 | 68G    | 26M     | 14
-       | R50     | Supervised | 72.43 | 33G    | 44M     | 12
R101    | R50     | SimKD      | 62.71 | 29G    | 44M     | 21
R101    | R50     | ReviewKD   | 61.17 | 37G    | 44M     | 19
ViT-B   | ViT-T   | SimKD      | 57.51 | 42G    | 26M     | 22
ViT-B   | ViT-T   | ReviewKD   | 57.2  | 84G    | 26M     | 17

nature of detection, introducing new obstacles related to regression, region proposals, and sparser label volumes [64]. As motivated in Sec. 6.3.3, we prioritize feature-based KD methods, with results on DocLayNet in Tab. 6.3. The performance comparison in terms of mean average precision (mAP) and FLOP counts shows that ResNet-50 students with SimKD are overall superior in

Table 6.4. Validation ANLS (scaled to %) of Llama-2-7b-chat [452] on SP-DocVQA [309] (top) and InfographicVQA [310] (bottom), where (if marked) the prompt is enriched with DLA predictions from a ViT-B-based Mask R-CNN.
space | task | DLA | ANLSval | Image/Photo | Yes/No | Figure/diagram | Form  | Free_text | Handwritten | Layout | Others | Table/list
✓     | ✓    | ✓   | 61.2    | 44.58       | 49.13  | 40.28          | 68.95 | 68.39     | 52.81       | 61.38  | 56.44  | 56.7
✗     | ✓    | ✓   | 58.39   | 44.43       | 41.67  | 34.81          | 66.38 | 67.82     | 52.1        | 59.19  | 55.91  | 52.79
✓     | ✓    | ✗   | 62.46   | 42.95       | 49.43  | 40.93          | 71.15 | 70.59     | 55.87       | 61.87  | 61.05  | 58.31
✗     | ✓    | ✗   | 57.63   | 45.38       | 51.52  | 34.97          | 67.88 | 69.71     | 53.19       | 55.51  | 55.78  | 53.81

space | task | DLA | ANLSval | Arithmetic | Comparison | Counting | Figure | Map   | Multi-span | Abs   | Q span | Single span | Table/list | Text  | Visual/layout
✓     | ✓    | ✓   | 28.05   | 9.92       | 25.28      | 7.83     | 26.28  | 19.0  | 21.85      | 8.82  | 41.84  | 33.54       | 25.57      | 34.6  | 29.17
✗     | ✓    | ✓   | 28.36   | 14.93      | 29.15      | 7.64     | 27.05  | 19.0  | 19.41      | 11.21 | 46.87  | 33.35       | 25.56      | 34.59 | 26.69
✓     | ✓    | ✗   | 27.97   | 9.78       | 25.13      | 6.99     | 25.93  | 21.04 | 22.33      | 8.2   | 43.36  | 33.53       | 25.76      | 35.06 | 27.47
✗     | ✓    | ✗   | 29.08   | 14.15      | 26.94      | 11.35    | 27.52  | 19.1  | 19.79      | 12.79 | 48.44  | 33.79       | 26.17      | 35.24 | 26.39
\ No newline at end of file
diff --git a/assets/txts/pg_0193.txt b/assets/txts/pg_0193.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3acd835d931478810cea4c2843c02db7d90f87a6
--- /dev/null
+++ b/assets/txts/pg_0193.txt
@@ -0,0 +1,44 @@
terms of both efficiency and detection, while the ViT-Tiny student has the smallest number of parameters with comparable performance in terms of mAP.
However, one can observe a generally large knowledge gap between the teacher and student models (≈8% for ViT and ≈10% for the ResNets), as crucial details about the document object boundaries, shapes, and sizes can get lost during the compression process. Moreover, KD performance with a ViT backbone is worse compared to ResNets due to (i) the attention overhead, i.e., transferring this attention-based knowledge to a student model requires careful consideration of how to distill these complex attention patterns effectively, and (ii) initialization and hyperparameter sensitivity, e.g., finding an appropriate domain-pretrained checkpoint and setting patch sizes and attention heads can affect the KD process, requiring more delicate tuning. The CNN layers of ResNets are, on the other hand, permutation invariant and provide more flexibility toward KD.
KD methods are hard to integrate into object detection frameworks, especially for ViTs, where there is no intermediate multi-scale FPN module. Our contribution lies in extending the hybrid SimKD [63] method to the DLA task and showing a competitive analysis with the existing SOTA, ReviewKD [67].
Downstream DLA-KD Tab. 6.4 reports results on the validation sets, as these are hyper-annotated with evidence, question and answer types, and operations, allowing for more fine-grained analysis. Detailed results for distilled DLA-enriched prompts are available in Appendix D.4.
On SP-DocVQA, DLA-enriched prompting (without spacing) improves from 57.63 → 58.39, whereas (with spacing) the improvement (27.97 → 28.05) is less pronounced on InfographicVQA; yet DLA predictions are still useful in this setting, as also evidenced by questions involving 'Visual/layout'. This is likely due to the greater visual and layout complexity of the dataset, on which DLA predictions are less accurate. Strikingly, spacing performs generally worse on Infographics, pointing to the heuristic nature of the structure-preserving OCR algorithm of [482], which fails on structurally complex documents with visually-situated language, charts with axis labels, legends, etc.
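To see why the whitespace heuristic is brittle on visually dense pages, consider a minimal rendition of the 'space' parameterization (a simplification we wrote for illustration; the actual algorithm of [482] uses more elaborate line grouping):

def space_layout(tokens, boxes, char_w=8, line_h=20):
    # Place OCR tokens on a character grid at their approximate position.
    # tokens: list of strings; boxes: (x1, y1, x2, y2) pixel coordinates.
    lines = {}
    for tok, (x1, y1, _, _) in zip(tokens, boxes):
        row, col = int(y1 // line_h), int(x1 // char_w)
        line = lines.get(row, "")
        lines[row] = line + " " * max(1, col - len(line)) + tok
    return "\n".join(lines[r] for r in sorted(lines))

# Overlapping chart labels or rotated text easily land on the same grid
# cell, scrambling the rendered layout of infographics.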
The objective of these experiments was to make (distilled) DLA output useful for enriching text-only LLMs with more semantic layout information beyond geometric-spatial relations. For every setting tested, the task instruction (Sec. 6.3.5) is vital (else ANLS < 5%) in the zero-shot setting. We hypothesize that for SP-DocVQA, line/row/column-level key-value pair recognition suffices for attaining good performance, so we expect little benefit from DLA-enriched
\ No newline at end of file
diff --git a/assets/txts/pg_0194.txt b/assets/txts/pg_0194.txt
new file mode 100644
index 0000000000000000000000000000000000000000..539574b060658bcf1f0a3f2a3898209e0882f134
--- /dev/null
+++ b/assets/txts/pg_0194.txt
@@ -0,0 +1,36 @@
prompts. However, as these experiments are bound to the layout classes pre-defined in DocLayNet, we believe that richer layout information, closer to semantic regions (e.g., an address block instead of an OCR block), and including the specification of common document objects such as stamps, logos, and watermarks, should benefit downstream DU tasks.

Table 6.5. Performance per KD method over metrics averaged over architectures on the RVL-CDIP dataset (in-domain) and the RVL-CDIP-N dataset (out-of-distribution).

DIC-KD This task benchmark reports on experiments with 3 backbones, 2 student architectures (except 1 for ResNet), and 6 KD methods each. Tab. 6.6 details the ViT and DiT results, whereas the ResNet results (following similar trends) are available in Appendix D. The same set of experiments was repeated for randomly initialized students (Tabs. D.12 and D.13). Given the comprehensive scope of the DIC experiments, we can make claims regarding the overall most performant KD method, the teacher-student capacity gap, and the architecture-pretraining gap. The ViT-Small student distilled with the SimKD [63] method performs best in terms of accuracy and AURC. Note that the best ViT-Tiny student, with only 5.5M parameters, reaches 83% accuracy with SimKD, only 2.9% behind the best ViT-Small student (22M parameters), showing the potential of advanced KD methods in retaining accuracy at such a large capacity gap. SimKD performs admirably in terms of accuracy, sometimes (depending on the projector type, MLP or CNN) on par with the supervised teacher. In terms of AURC, the NKD and MSE approaches are best-performing, both of which are response-based methods. Regarding the pretraining gap, as shown in Tab. 6.6, the results indicate that a self-supervised teacher like DiT does not meet expectations when distilling knowledge to a ViT-based student pretrained with ImageNet weights. This could be attributed to the large representation gap in feature space between the RVL-CDIP-pretrained and ImageNet-pretrained models. However, evaluation under covariate shift on RVL-CDIP-N (Tab. D.8) demonstrates that DiT-based students (distilled with response-based KD strategies)
DIC-KD This task benchmark reports on experiments with 3 backbones, 2 student architectures (except 1 for ResNet), and 6 KD methods each. Tab. 6.6 details the ViT and DiT results, whereas the ResNet results (following similar trends) are available in Appendix D. The same set of experiments was repeated for randomly initialized students (Tabs. D.12 and D.13). Given the comprehensive scope of the DIC experiments, we can make claims regarding the overall most performant KD method, the teacher-student capacity gap, and the architecture-pretraining gap. The ViT-Small student distilled with the SimKD [63] method performs best in terms of accuracy and AURC. Note that the best ViT-Tiny student with only 5.5M parameters reaches 83% accuracy with SimKD, only 2.9% behind the best ViT-Small student with 86M parameters, showing the potential of advanced KD methods in retaining accuracy at such a large capacity gap. SimKD performs admirably in terms of accuracy, sometimes (depending on the projector type, MLP or CNN) on par with the supervised teacher. In terms of AURC, the NKD and MSE approaches perform best; both are response-based methods. Regarding the pretraining gap, as shown in Tab. 6.6, results indicate that a self-supervised teacher like DiT does not meet expectations when distilling its knowledge to a ViT-based student pretrained with ImageNet weights. This could be attributed to the large representation gap in feature space between RVL-CDIP-pretrained and ImageNet-pretrained models. However, evaluation under covariate shift on RVL-CDIP-N (Tab. D.8) demonstrates that DiT-based students (distilled with response-based KD strategies)
\ No newline at end of file diff --git a/assets/txts/pg_0195.txt b/assets/txts/pg_0195.txt new file mode 100644 index 0000000000000000000000000000000000000000..540acddc76ab63bb2baada7e72630c1ca09ee9f4 --- /dev/null +++ b/assets/txts/pg_0195.txt @@ -0,0 +1,185 @@
outperform ViT→ViT students, pointing to the potential of self-supervision for robustness to distribution shift.
Table 6.6. Results of different KD strategies benchmarked for D/ViT-B teachers applied on the RVL-CDIP dataset.

ViT-B teacher:
Student | Method | ACC | AURC | ECE
– | ViT-B | 0.891 | 0.017 | 0.034
– | ViT-S | 0.853 | 0.030 | 0.058
– | ViT-T | 0.822 | 0.040 | 0.043
ViT-S | Vanilla [τ = 2.5, α = 0.5] | 0.854 | 0.028 | 0.049
ViT-S | NKD [τ = 1, γ = 1.5] | 0.840 | 0.036 | 0.074
ViT-S | MSE | 0.855 | 0.028 | 0.051
ViT-S | SimKD [CLS+MLP] | 0.859 | 0.028 | 0.287
ViT-S | SimKD [CNN] | 0.847 | 0.062 | 0.141
ViT-S | FitNet [middle] | 0.843 | 0.048 | 0.141
ViT-T | Vanilla [τ = 2.5, α = 0.5] | 0.825 | 0.038 | 0.058
ViT-T | NKD [τ = 1, γ = 1.5] | 0.815 | 0.046 | 0.094
ViT-T | MSE | 0.823 | 0.040 | 0.066
ViT-T | SimKD [CLS+MLP] | 0.830 | 0.095 | 0.163
ViT-T | SimKD [CNN] | 0.829 | 0.056 | 0.150
ViT-T | FitNet [middle] | 0.812 | 0.051 | 0.153

DiT-B teacher:
Student | Method | ACC | AURC | ECE
– | DiT-B | 0.933 | 0.075 | 0.010
– | ViT-S | 0.831 | 0.042 | 0.056
– | ViT-T | 0.801 | 0.053 | 0.047
ViT-S | Vanilla [τ = 2.5, α = 0.5] | 0.831 | 0.060 | 0.080
ViT-S | NKD [τ = 1, γ = 1.5] | 0.790 | 0.058 | 0.040
ViT-S | MSE | 0.831 | 0.060 | 0.082
ViT-S | SimKD [CLS+MLP] | 0.838 | 0.087 | 0.438
ViT-S | SimKD [CNN] | 0.851 | 0.048 | 0.136
ViT-S | FitNet [middle] | 0.775 | 0.063 | 0.077
ViT-T | Vanilla [τ = 2.5, α = 0.5] | 0.801 | 0.064 | 0.081
ViT-T | NKD [τ = 1, γ = 1.5] | 0.772 | 0.066 | 0.041
ViT-T | MSE | 0.795 | 0.076 | 0.081
ViT-T | SimKD [CLS+MLP] | 0.816 | 0.104 | 0.439
ViT-T | SimKD [CNN] | 0.832 | 0.056 | 0.152
ViT-T | FitNet [middle] | 0.753 | 0.077 | 0.054

Covariate shift DIC-KD To answer whether certain KD methods harm a student model's robustness to covariate shift, we plot results per KD method, averaged over the 3 backbones (Tab. 6.5). This re-establishes the superiority of SimKD [CNN] in terms of accuracy, both ID and OOD, yet due to poor calibration it loses its gain over the teacher in terms of AURC. Strikingly, MSE attained the lowest OOD performance, whereas it was a solid ID choice. Tab. D.8 provides more detail on the performance of the different KD methods on RVL-CDIP-N, where we observe that, grouped per KD strategy, response-based methods are superior across all metrics.
6.5 Chapter Conclusion
KD-based model compression has been a popular technique in recent years, although DU research has paid little attention to efficiency. Our work explores a limited scope of KD for DU at scale, revealing great potential for creating efficient counterparts of the cumbersome DLA models used today. Specifically, we show that SimKD is a particularly strong KD method, always outperforming vanilla KD and even obtaining a 16x smaller model retaining >90% relative accuracy. Moreover, we investigate the potential of DLA for enriching document inputs in downstream DocVQA tasks. Traditionally, DocVQA has relied on plain OCR text.
While structure-preserving OCR provides a notion of geometric
\ No newline at end of file diff --git a/assets/txts/pg_0196.txt b/assets/txts/pg_0196.txt new file mode 100644 index 0000000000000000000000000000000000000000..c18b4977935bff3c0e05e26671e850e91e16b11e --- /dev/null +++ b/assets/txts/pg_0196.txt @@ -0,0 +1,30 @@
layout for downstream use, DLA was never considered before for the same purpose, yet our experiments show promise.
The more comprehensive benchmarking of KD methods in DIC, with ID evaluation and a covariate shift protocol, reveals interesting observations regarding the feature representation and weight initialization gap between DiT (documents) and ViT (natural images), while also showing that self-supervision makes students more robust in the OOD setting. Our framework enables informed selection of compressed models and directs several interesting explorations: how pretraining objectives impact the distillation process, whether different layout representations (e.g., [15, 187, 263, 443, 555]) allow for a more robust downstream transfer, etc.
Limitations While we primarily use DocLayNet, it remains the DLA dataset with the most diversity in layout elements, both in terms of categories and of shape or size. However, the downstream DocVQA results call for more diversity in terms of document types, domains, and objects (e.g., layout objects such as logos, watermarks, stamps, signatures). Thus, the community is in dire need of a dataset diverse enough to guarantee a performance improvement downstream. Moreover, multimodal KD was not considered in this work, though it holds promise for more efficient, all-round DU models. The downstream task was not tested on [468], as multipage documents are more complex to benchmark with limited-sequence-length LLMs. Also, DLA, being a fairly complicated instance segmentation task, is difficult to adapt for KD-based model compression, ruling out some KD methods. This calls for a better experimental framework and architectural modeling to boost the exploration of KD in DLA, in turn incubating downstream advances in processing and understanding VRDs.
\ No newline at end of file diff --git a/assets/txts/pg_0197.txt b/assets/txts/pg_0197.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a27bfa2b4a7dbcdf2572753674bda416bb97d2a --- /dev/null +++ b/assets/txts/pg_0197.txt @@ -0,0 +1,30 @@
Chapter 7
Conclusion
This final chapter summarizes the work done in this thesis. Additionally, we formulate the key contributions and propose some exciting avenues for future research.
7.1 Summary
To summarize, this thesis contains the following contributions (C) and key findings (→), respective to the research questions from the introduction:
When tested in realistic language data distributions on various text classification tasks, how well do PUQ methods fare in NLP? In which settings are PUQ methods most useful, i.e., to which failure sources/distribution shifts are they most sensitive?
C 1. We conduct a benchmarking study of established PUQ methods applied to six real-world text classification datasets with a focus on model robustness and uncertainty quality. This large-scale study comes with advanced statistical analysis to validate significant differences between methods and datasets.
C 2. We propose a practical experimental methodology to test relevant distribution shifts (cross-domain classification and novelty detection), resulting in a better understanding of the individual shortcomings of PUQ methods.
\ No newline at end of file diff --git a/assets/txts/pg_0198.txt b/assets/txts/pg_0198.txt new file mode 100644 index 0000000000000000000000000000000000000000..764e9237f1d61c7ea7f74f477a4fca657719c1f9 --- /dev/null +++ b/assets/txts/pg_0198.txt @@ -0,0 +1,35 @@
→ The general behavior of PUQ methods does not hold across different datasets. We do observe specific correlations between PUQ methods and the problem setting representing task characteristics, for which we formulated practical takeaways. This reconfirms the need for modality- and task-specific benchmarking of PUQ methods.
→ In general, PUQ methods are sensitive to distribution shifts, and methods that exhibit better in-domain calibration also exhibit better robustness to novel class shifts. The tested setting of cross-domain classification under covariate shift is the most challenging for PUQ methods. This is evident from relatively low AUROC scores due to the presence of comparably similar linguistic patterns across domains.
How can we obtain better PUQ estimates without overrelying on computationally prohibitive methods, e.g., Deep Ensemble [238]?
C 3. We propose novel combinations of PUQ methods, providing both well-motivated intuition and empirical evidence for the complementary benefits of combining different posterior approximation procedures.
→ Our proposed hybrid PUQ methods improve over singular methods in in-domain calibration, novelty detection, and out-of-domain detection. In particular, we show that the combination of Deep Ensemble with Concrete Dropout demonstrates higher diversity in posterior samples and superior performance, even at a smaller ensemble size compared to a Deep Ensemble.
How important are certain prior, neural architecture or hyperparameter influences on the quality of PUQ estimation?
C 4. We conduct a range of ablation experiments to investigate the influence of prior, neural architecture and hyperparameter choices on the quality of PUQ estimation. In particular, the number of stochastic posterior samples, the dropout rate, and the architecture are shown to have a significant impact on the quality of PUQ estimation.
\ No newline at end of file diff --git a/assets/txts/pg_0199.txt b/assets/txts/pg_0199.txt new file mode 100644 index 0000000000000000000000000000000000000000..077b5c0d64b92ca92293adea39fa54193dd891d1 --- /dev/null +++ b/assets/txts/pg_0199.txt @@ -0,0 +1,31 @@
→ The combination of posterior geometry and weight-based priors proves to be powerful for PUQ estimation, with the Deep Ensemble and Concrete Dropout methods as the best-performing methods in our benchmark. Nevertheless, it is important to consider adapting the dropout rate to the text classification task at hand, which, individually and in an ensemble, impacts model robustness and uncertainty quality.
→ Contrary to previous work, we find that pretrained transformers in NLP severely underperform in novelty detection compared to 1D CNNs, limiting the applicability of transfer learning when distribution shift from novel classes can be expected.
How severe is the problem of hallucination and control in LLMs when evaluated in a selective, free-form DocVQA task setting?
C 5. We design the DUDE dataset with this task setting in mind, incorporating a large set of unanswerable questions that are realistic and relevant to the document's content.
→ Hallucination and control remain severe problems in LLMs, with a large fraction of unanswerable questions being answered with high confidence. When trained on a large set of unanswerable questions, LLMs improve at identifying unanswerable questions, yet at the expense of harder, abstractive questions, on which they become overcautious (e.g., ChatGPT predicting more than half of abstractive questions as unanswerable). With longer context, LLMs are also more likely to hallucinate answers. Overall, results lag behind the human baseline performance on DUDE, indicating that LLMs are still far from being able to reason about documents in their entirety without control measures.
\ No newline at end of file diff --git a/assets/txts/pg_0200.txt b/assets/txts/pg_0200.txt new file mode 100644 index 0000000000000000000000000000000000000000..dba26c61ef7ea379c56219d8db015f1da1e9927e --- /dev/null +++ b/assets/txts/pg_0200.txt @@ -0,0 +1,29 @@
How can we iteratively close the gap between research and practice in DU?
C 6. We take stock of the balance between research and applications in document classification, a prototypical DU task, and we identify the main challenges that are stalling progress in the field, with a focus on data construction and evaluation methodology.
C 7. We propose a novel formalization of multipage document classification scenarios, which we use to construct two novel datasets, RVL-CDIP_MP and RVL-CDIP-N_MP, which are more realistic and more challenging than their single-page counterparts.
C 8. We conduct an insightful experimental analysis of the novel datasets.
→ The experimental analysis reveals that current SOTA models are not able to leverage the additional context provided by multipage documents, and that the performance gap between single-page and multipage document classification is still large. Ablation experiments show the promise of advancing multipage document representation learning and inference.
→ Major dataset construction efforts are required to bridge the currently existing gap and to be able to rely on benchmarks for transfer to real-world applications. In particular, we identify the need for more realistic and more challenging datasets, regarding, e.g., the type and diversity of document data and the variety and quality of label sets, as well as the need for more comprehensive evaluation methodologies.
\ No newline at end of file diff --git a/assets/txts/pg_0201.txt b/assets/txts/pg_0201.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5549ac37bd0bb7b32847e9f4c2d04f39d7bcc68 --- /dev/null +++ b/assets/txts/pg_0201.txt @@ -0,0 +1,29 @@
How can we design a resource that comprehensively challenges the state-of-the-art? Which DU aspects are most challenging for current state-of-the-art LLMs? How can these be incorporated in a benchmark to allow proper measurements of future improvements?
C 9. We have designed a completely novel benchmark from the ground up, DUDE, collecting 40K QA pairs for 5K documents and constructing a multi-faceted dataset (multipage (µ = 6), multi-domain (±15), multi-type (±200), multi-QA (extractive, abstractive, list, unanswerable), multi-task (DIC, KIE, DLA, DOD, etc.), multi-OCR (Tesseract, Azure, AWS), multi-source, multi-stage (<5) annotations) to foster research on generic DU, bypassing long-context restrictions and evaluating the reliability and robustness of DU technology, as close as possible to real-world requirements.
C 10. The dataset construction approach of DUDE is based on a set of principles that we have formulated, which we believe are essential for a comprehensive benchmark for generic DU. More specifically, leveraging the DocVQA task paradigm and the learning paradigm of Multi-Domain Long-Tailed Recognition allowed us to incentivize harder questions on visual/layout semantics, layout navigation, or multi-step reasoning, while organically obtaining questions relevant to the document type and instance.
C 11. We have conducted our own baseline experiments on DUDE, evaluating the performance of SOTA DU models on the different facets of DUDE, as well as the reliability and robustness of LLMs in the context of DU. Next, we have organized a competition to challenge the community's best, additionally incorporating OOD detection and selective generation to evaluate CSFs on two common failure sources.
\ No newline at end of file diff --git a/assets/txts/pg_0202.txt b/assets/txts/pg_0202.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6fd5aa7fbd7e477678a5c012fb255691d4827e --- /dev/null +++ b/assets/txts/pg_0202.txt @@ -0,0 +1,42 @@
→ The best results attain ANLS <= 50%, with our baseline T5-2D (8K context) scoring 46%; the competition winner improves on this by 4% absolute by leveraging multimodal LLMs (BLIP2 and ChatGPT). Generally, stronger performance is expected from models that incorporate layout understanding and reasoning over multiple pages. Nevertheless, diagnostic results prove that the current SOTA still suffers on questions with visual evidence (only half of human performance) or any reasoning operations (counting, comparison, etc.). With the rise of multimodal LLMs, better solutions are coming, yet due to its designed complexity, DUDE might remain "the benchmark to beat" for a long time.
→ Even while DUDE presents a great test bed for the challenge of long-context processing (Section 2.3.4.1), the evaluated models have not yet reached the point where they can fully leverage the additional context. This is a clear indication that more research is needed in the direction of efficient processing of long, structured documents.
→ We find that the quality of confidence estimation worsens with longer context, potentially from having to consider more possible answers. We also find that using a maximum-confidence strategy over answers generated per page results in substantially worse calibration. These interactions between multiple DU challenges prove the usefulness of incorporating and evaluating them jointly in a benchmark.
How can we efficiently infuse LLMs with semantic layout awareness for more focused information extraction? To what degree can model compression resolve the problem of efficiency in processing documents?
C 12. We propose a novel experimental methodology to investigate the enrichment of VRDs with semantic layout structure, derived from effective distillation of DLA models, to practically and efficiently improve downstream DU applications. This includes evaluation of KD methods in DIC under covariate shift, and a downstream setup to assess the robustness of distilled DLA models on zero-shot layout-aware DocVQA.
C 13. We present the first application of KD to visual document tasks (DIC, DLA), investigating the teacher-student knowledge gap in KD-based model compression methods (response- and feature-based) with task architectures involving different inductive biases (CNN vs. ViT), pretraining (self-supervised), student initialization, and capacities (base-small-tiny).
\ No newline at end of file diff --git a/assets/txts/pg_0203.txt b/assets/txts/pg_0203.txt new file mode 100644 index 0000000000000000000000000000000000000000..260b5b3f55281759863594597306c9edc80cae4d --- /dev/null +++ b/assets/txts/pg_0203.txt @@ -0,0 +1,44 @@
→ While we have promoted the use of semantic layout over geometric layout for enriching LLM prompts, this results in only limited performance improvements, which we attribute to either the zero-shot evaluation setup or the limited subset of layout classes and the domain shift from the DLA training data (DocLayNet). In some cases, e.g., questions involving visual/layout evidence, DLA-enriched prompting proves more useful.
→ KD-based model compression is very effective in reducing model size while maintaining accuracy at large capacity gaps; e.g., a strong student is SimKD ViT-Tiny, which retains 93% of teacher accuracy in relative terms while being 16x smaller. Ablations show how the teacher-student knowledge gap is affected by the inductive biases of the task architecture, the pretraining of the student, the student initialization, and the student capacity. For example, a self-supervised teacher provides more robust students when evaluated under covariate shift. Nevertheless, model compression is but one tool in a larger toolbox for the efficient processing of documents, which we believe is a key challenge for future research, going hand-in-hand with efficient longer-context modeling.
As this thesis was conducted in an applied research environment, and keeping in mind that nowadays DL research is primarily empirical, our contributions have focused on datasets and experimental methodology rather than on novel algorithms, which more often than not present mere incremental improvements on the state-of-the-art. Nevertheless, we believe that the proposed datasets and experimental methodologies are of great value to the community, as they provide a more realistic and more challenging test bed for future DU research. We are happy to see the proposed datasets and experimental methodologies increasingly being adopted by the community, and we hope this will foster research on more efficient and closer-to-real-world document processing, ultimately leading to more reliable and robust DU technology.
7.2 Perspectives For Future Research
This Section discusses some exciting research opportunities left for future work. First, we present a curated set of research questions particular to PUQ, calibration, and failure prediction, which, when relevant, are linked to DU applications.
Next, we take a futuristic look at the design of a fully-fledged IA-DU solution, dreaming up the ultimate dataset and system design for DU.
\ No newline at end of file diff --git a/assets/txts/pg_0204.txt b/assets/txts/pg_0204.txt new file mode 100644 index 0000000000000000000000000000000000000000..917f59b31ce7692852e146c00cd9da5ea42bcec7 --- /dev/null +++ b/assets/txts/pg_0204.txt @@ -0,0 +1,49 @@
7.2.1 Open Problems In Reliability & Robustness
Recent advancements in LLMs have brought many groundbreaking improvements to the field of DU, yet the reliability of LLMs is still far from solved. This is exacerbated by API-based services or closed-source LLMs [344], which are to be treated as black boxes without access to model internals or token-level output logits, making it hard to apply most PUQ methods. Popular white-box approaches include verbalized probabilities [273] or semantic entropy [226] for taking into account semantic equivalence or specificity (e.g., Where was the 2023 International Conference on Computer Vision held? → In Paris vs. In the capital of France vs. In Europe). Specific to selective generation, when knowledge on a topic is limited, it can be hard to censor LLM outputs (even when finetuning further with human feedback) or to evaluate abstention reliably (e.g., I don't know vs. I don't care vs. '').
[111] implement a framework bundling a battery of white-box and black-box methods for LLM confidence estimation in text generation, yet it still requires human inspection of the generated text together with the confidence score, which is not very scalable for large-scale document processing. This ties into the evaluation crisis of LLMs, a topic of active research [137]. In the short term, it might suffice to reward models that predict the full distribution of human judgments or learn human preferences for generated text. However, how can we expect models "to do what humans do" when even humans disagree or are not consistent in their judgments? Alternative approaches are to rationalize judgments, attribute or ground the evidence used for a judgment, or ask for clarifications when needed. In the long term, we should move beyond human evaluation, which is expensive, time-consuming, and not scalable. Important explorations include prompt chaining (Please give a confidence between 0 and 1 about how certain you are this is the correct answer) or self-evaluation [207, 391] to induce reflections on the quality of LLM outputs.
Beyond the potentially infinite, though countable, output spaces of generative tasks, there exists an opportunity to study calibration for specific output spaces, e.g., sequence-structured outputs in the context of sequence tagging or restricted sequence-to-sequence tasks. Moreover, calibration metrics and methods can be adapted to the specific task or output space, such as structured prediction [227], named entity recognition [222], object detection and segmentation [85, 234, 350], etc. With most works (if at all) reporting top-1 miscalibration, efficient estimation of "stronger" calibration notions is a crucial area of study to inform the derivation of calibrated regularized loss functions [370]. On the more theoretical side, it remains vital to investigate the link between non-convex optimization (e.g., flat minima) and calibration, as well as when optimizing a proper loss yields calibration [42, 549].
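To make the prompt-chaining idea concrete, a minimal sketch of eliciting a verbalized confidence in a second dialogue turn; the llm_chat helper and the exact prompt wording are hypothetical placeholders, not a specific provider API:

def answer_with_confidence(llm_chat, document_text, question):
    # llm_chat: any callable that maps a list of chat messages to a reply string
    messages = [{"role": "user",
                 "content": f"{document_text}\n\nQuestion: {question}\n"
                            "Answer with a short span, or 'unanswerable'."}]
    answer = llm_chat(messages)
    # Chain a second turn asking the model to verbalize its confidence
    messages += [{"role": "assistant", "content": answer},
                 {"role": "user",
                  "content": "Please give a confidence between 0 and 1 about "
                             "how certain you are this is the correct answer."}]
    try:
        confidence = float(llm_chat(messages).strip())
    except ValueError:
        confidence = 0.0  # an unparseable verbalization counts as no confidence
    return answer, min(max(confidence, 0.0), 1.0)

Note that such verbalized confidences are themselves uncalibrated outputs, which is precisely why their quality needs to be measured.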
\ No newline at end of file diff --git a/assets/txts/pg_0205.txt b/assets/txts/pg_0205.txt new file mode 100644 index 0000000000000000000000000000000000000000..b84b7e40962befd9c2ecba3218d4468e5426844a --- /dev/null +++ b/assets/txts/pg_0205.txt @@ -0,0 +1,45 @@
Selective prediction has been garnering increased attention thanks to increasingly comprehensive benchmarks [127, 193], yet these have (again) been focused on vision problems and architectures, inviting the same level of benchmarking on alternative modalities and tasks. To the best of our knowledge, there exists no work on extending selective prediction methodology to multi-task settings (e.g., consider the typical combination of document classification and KIE), requiring a more complex learned CSF (for different output spaces) or a combination of multiple CSFs with multiple thresholds. Similar to calibration, differentiable loss functions for failure prediction are an open problem. More theoretical questions include the relationship between stronger notions of calibration and confidence ranking, as well as the link between feature space disentanglement and CSF ranking [552]. In the low-data regime, sample-efficient failure prediction is an open problem, which could leverage connections to semi-supervised and active learning [112].
7.2.2 A Future-Proof Design Of IA-DU
Downstream datasets are a key component of any practical, supervised ML solution, yet they are often overlooked in the expectation of decent zero-shot performance with LLMs, which are trained on large-scale, generic language datasets, such as Common Crawl or the Pile [130]. While these datasets are very useful for pretraining general language understanding, they are not sufficient for all possible downstream tasks. This is especially true for DU, where text is but one of the modalities to be considered. As part of the conclusion to this thesis, we first discuss how to obtain the ultimate dataset for generic DU, and next we detail the design of a fully-fledged IA-DU solution.
7.2.2.1 The 'Ultimate' DU Dataset?
Arguably, a core contribution of this thesis is the design of the DUDE dataset, which we believe is a step in the right direction toward the ultimate dataset for generic DU. Top-of-mind extensions of DUDE include: multilingual or cross-lingual documents and questions; answer and evidence grounding to improve evaluation and interpretability; and question decomposition and simplification. Finding a complete answer to the question of the ultimate DU dataset would be transformative to DU technology, yet here we can only provide some pointers, discussed in the structure of goal, starting points, and aspects to target.
\ No newline at end of file diff --git a/assets/txts/pg_0206.txt b/assets/txts/pg_0206.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d167d5e55a2ca65f6b80fe9d857f9a8dc5c1ff5 --- /dev/null +++ b/assets/txts/pg_0206.txt @@ -0,0 +1,45 @@
Goal DU requires reasoning over documents in their entirety, which is a very complex task with the aforementioned challenges. With current technology, this involves learning document representations that are both rich and compact, and that can be used to answer any question about the document.
Consider how challenging this is when most relevant questions are either about the intentionality of the document's author or about the way a user interacts with it, hinting at a potential observer's paradox in future data collection. For example, on a car invoice, an accountant would ask What is the total amount due?, or Is this a valid invoice with correct taxation?, while a customer would ask How much do I finally have to pay?, and the insurance broker What is the chassis identifier to link the omnium coverage to?. A model should be able to capture all these nuances about the complexity of a document, which could be seen as the expectation of all possible relevant questions that can be asked about it, while also being able to generalize to unseen documents and questions. Therefore, the goal of the ultimate DU dataset is to provide a test bed for evaluating progress in commonsense reasoning on documents from real-world interactions, for which we hypothesize that the scale and depth of supervision are vital.
Starting points The ultimate DU dataset should be designed with the aforementioned goal in mind, yet some seminal ML datasets can be inspiring. While the 'ImageNet moment' is etched in everyone's memory, MS COCO [274] was arguably a more impactful dataset thanks to its large-scale, diverse, and high-quality nature combining multiple tasks (image captioning, object detection, semantic segmentation, etc.). To build the equivalent of MS COCO in document understanding, DUDE offers a good starting point, under some conditions and with necessary extensions. An important aspect concerns ground truth collection for DocVQA and the complexity and specificity of questions and answers, which has been approached differently by recent works: DUDE uses a multi-stage approach to collect a large set of minimally constrained, human-generated questions under the MDLT paradigm, which were afterward annotated with diagnostic categories; PDFTriage [400] pre-defines question types and collects a small set of human-generated questions; DocEdit [311] establishes a pre-defined taxonomy and tests language as a universal UI to interact with the hierarchical, discrete structure of documents. The extent to which the collected QA pairs constitute a representative sample of the space of all possible and relevant questions that can be asked about a document instance is an open problem, which can be approached by (A) extending and scaling up existing practices or (B) deepening supervision for models to generalize better from limited inputs.
\ No newline at end of file diff --git a/assets/txts/pg_0207.txt b/assets/txts/pg_0207.txt new file mode 100644 index 0000000000000000000000000000000000000000..69371f16ab3d4db4766f12845cf4b4fe95b4144f --- /dev/null +++ b/assets/txts/pg_0207.txt @@ -0,0 +1,47 @@
A. Scale We identify three targets to scale up: (I) document collection, (II) question collection and validation, and (III) question-answer generation.
(I) Throughout the document dataset construction, the goal is to collect a large set of diverse document types and instances, differing in all modalities (language, layout, visual, etc.) and additional meta-criteria (industry, language, type, etc.).
The document collection approach taken in DUDE was a fairly artisanal process: based on experience, we designed an industry-document taxonomy, which we used to collect a large set of document types and instances, also taking into account the presence of different visual semantics or document objects, e.g., handwriting, stamps, watermarks, address blocks, etc. We leveraged a semi-automatically created keyword-style search ('Please list 30 common retail document types with their synonyms like Credit memos - {"credit notes", "credit slips", "refund slips"}') on public document collections, and validated diversity post-hoc in terms of modality-specific features (TF-IDF or ResNet features) vs. other datasets.
A more scalable approach would be to leverage cluster-based diverse sampling from larger document collections, such as Common Crawl [460]. While this approach would be more scalable, it would be challenging to ensure that the collected documents are diverse in terms of all modalities, which is a topic to be investigated. Relevant caveats are the presence of duplicates, sensitive information, and the need to balance language priors so as not to create Clever Hans effects for models to later exploit [405]. An active topic of research is document generation [169] or augmentation [304], which could fill the gap in document diversity, yet it would be challenging to ensure that the generated documents are both realistic and diverse. Seeing that business documents are hard to obtain, one could backtrack to visually-situated language.
(II) To ensure that questions are specific to a document, and not merely testing language understanding, cross-lingual questions could help counter reliance on language priors. However, both multilingual documents and cross-lingual questions are challenging to collect, as they require annotators capable of reading multiple languages. How people interact with documents (i.e., the questions asked) without being systematically observed is what makes for interesting data, yet it is also the most challenging to collect. This is certainly true for subject-matter experts from different industries (government, finance, legal, etc.) who are not readily available for annotating documents. Naturally, as more documents are being collected, one should define a strategy to scale up the number of questions per document in a balanced way. Ideally, the number of questions per document should be a function of the document complexity, which is another open problem. Some basic strategies would be (i) to split questions evenly over pages by chunked annotation, yet this would constrain multi-hop and naturally complex questions, or (ii) to exploit the Gestalt principle [294], which states
\ No newline at end of file diff --git a/assets/txts/pg_0208.txt b/assets/txts/pg_0208.txt new file mode 100644 index 0000000000000000000000000000000000000000..749a5e2c5420ec586ff1495b3a24094067a8275e --- /dev/null +++ b/assets/txts/pg_0208.txt @@ -0,0 +1,42 @@
that the number of questions should be higher for heterogeneous elements in a document. Finally, an untapped approach would be to generate questions automatically, which is an open research challenge.
(III) QA generation holds promise to grow a large-scale dataset. A possible approach would be to teach the current SOTA model on DUDE to generate questions (given possible answers, predict questions) similar to those in the training set.
A harder problem is the generation of unanswerable questions, which we found hard to even elicit from humans. Potential caveats are the quality and factuality [303] of the generated questions. This might be improved upon by first generating rich and compositional captions for a document relative to its content and visual appearance, and then generating different questions based on the descriptions, with both paraphrasing and backtranslation for question variations and augmentations.
B. Supervision Depth The reasoning behind increasing the depth of supervision is that we might be expecting too much, i.e., answering complex questions involving multiple manipulations of document-instance and/or domain-specific concepts based on a single set of reference answers, with a poor stimulus [476], i.e., not providing enough, complex enough, and diverse enough examples for models to generalize well.
Accounting for every possible question will be impossible. A possible approach, inspired by MDLT and the diagnostic categories in DUDE, is to (i) decompose questions in terms of the skills and concepts (Definitions 15 and 16) required to answer them and pass this along as instructions; and (ii) hyper-annotate more explicit answers, with answer and evidence grounding for attribution, better explaining the relations between primitives (skill-concept compositions). Figure 7.1 illustrates an example of (ii), where the answer is decomposed into a skill-concept composition, and the evidence is grounded to the relevant document objects. Such rich supervision should help models both discriminate known skills and concepts and generalize better to new skill-concept compositions. Although it would be expensive to obtain such supervision in large quantities, the use of human-in-the-loop or active learning could reduce the annotation burden.
Definition 15 [concept]. An abstract term to denote document visual objects (atomic [cell, barcode] and molecular [table, chart, form]) and entities (generic [document identifier, person, date] and domain-specific [invoice number, insured, payment date]).
Definition 16 [skill]. Any manipulation [existence, counting, relation,
\ No newline at end of file diff --git a/assets/txts/pg_0209.txt b/assets/txts/pg_0209.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd193913cf517a2e374ffee2d8f5be6564c1d0e2 --- /dev/null +++ b/assets/txts/pg_0209.txt @@ -0,0 +1,31 @@
Figure 7.1. Example of ground truth formatting for a question-answer pair in DUDE.
hasattribute, etc.] of a concept, or a combination of the concepts (evidence) involved.
Our overall idea is similar to how [243] alludes to intelligence: "the ability to decompose a problem into a set of skills and concepts, to reuse those skills and concepts in new situations, or acquire new ones quickly". The proposed format would be a full-featured instruction tuning dataset, which has proven very useful in other settings [404, 486] and which could be a valuable resource for future research on instruction-based learning of already existing and future DU tasks.
Naturally, all of this relies on the assumption that each question-answer pair can be decomposed into skill-concept compositions, and that there exists an exhaustive taxonomy of skills and concepts for DU, which thus far has not been created.
A possible approach would be to leverage existing resources such as VerbNet [410] to define skills, or to build an API for DocVQA similar to [437, 535] to decompose questions into programs with subroutines, e.g., How many of the contract's pages have signatures? → Counting([Navigation(document), Existence(signature, page)]); and to construct a complete taxonomy of document concepts in both a bottom-up (human prior) and top-down (data-driven) fashion, extending it over time with domain-specific concepts. Ideally, this taxonomy should not be static at inference time, hinting at more research needed into neuro-symbolic learning for dynamic knowledge graphs to assist in recognizing and adding new concepts [32].
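As an illustration only (the types and names below are ours, not an existing DocVQA API), such skill-concept programs could be represented as a small recursive data structure:

from dataclasses import dataclass, field

@dataclass
class Concept:
    name: str  # e.g., "signature", "page", "document"
    def __repr__(self):
        return self.name

@dataclass
class Skill:
    name: str  # e.g., "Existence", "Counting", "Navigation"
    args: list = field(default_factory=list)  # Concepts or nested Skills
    def __repr__(self):
        return f"{self.name}({', '.join(repr(a) for a in self.args)})"

# "How many of the contract's pages have signatures?"
program = Skill("Counting", [
    Skill("Navigation", [Concept("document")]),
    Skill("Existence", [Concept("signature"), Concept("page")]),
])

Printing program yields Counting(Navigation(document), Existence(signature, page)), the decomposed form referenced above.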
There are several fundamental questions that can be asked here: "Is it needed to
\ No newline at end of file diff --git a/assets/txts/pg_0210.txt b/assets/txts/pg_0210.txt new file mode 100644 index 0000000000000000000000000000000000000000..c061f3e72b3b5bb0f772e68d891e84bdaeb94e36 --- /dev/null +++ b/assets/txts/pg_0210.txt @@ -0,0 +1,47 @@
collect thousands of QA-pair examples to learn a specific document skill-concept composition, e.g., address block detection?" Recent works seem to suggest not, indicating an emergent ability of the current best LLMs to find zero-shot solutions to a broad range of analogy problems [486]. Finally, building ground truth more amenable to advanced prompting and instruction-based learning [248] will likely prove as useful as question decomposition has in semantic parsing [189, 358, 525].
7.2.2.2 A Feature-complete IA-DU Solution?
The main takeaway of this thesis is that while more compute, more data, and more powerful algorithmic tools have allowed significant progress in DU, there is still a long way to go toward the objective of reliable, robust, realistic, and efficient DU. For now, a major component would be a general-purpose Transformer-based stack for interfacing with a document through natural language. Most likely, this would be a multimodal LLM pretrained with a variety of pretraining objectives on the richest and largest possible corpus of documents and related data. When zero-shot performance is not sufficient, it would be instruction-finetuned on new QA pairs, e.g., in the rich format proposed in Section 7.2.2.1, resulting in efficient adapters that can be served concurrently on the same prediction model [417]. However, this is not a complete solution, as generative modeling brings additional challenges (e.g., expensive pretraining, decoding-based inference, confidence estimation, dependence on human evaluation, scalability).
Instead, we will focus here on another component of a complete solution, namely a failure forecaster, which we believe to be equally important for bringing LLMs closer to real-world applications. We envision this to be a lightweight module separate from the prediction model, which could easily be fully retrained and updated with new data, bypassing the risk of catastrophic forgetting and the need for retraining the more cumbersome LLM. The failure forecaster should predict the performance of the LLM on a given input (document, question, metadata, etc.) and output (answer). It can be a very simple (e.g., logistic regression) or complex model (e.g., a large DNN), yet most of its complexity resides in the feature modeling and subsequent learning of the sources of uncertainty. Our failure forecaster design is informed by [114]. We non-exhaustively identify sources of failure or uncertainty that can be modeled by the failure forecaster: (i) input uncertainty, (ii) output uncertainty, and (iii) distributional metrics. We discuss each of these in turn.
(i) Before answering any question, the document instance should be analyzed for inherent uncertainty or quality issues: e.g., whether it is born-digital or OCRed, the quality of the OCR, readability metrics to capture how easy the document text
\ No newline at end of file diff --git a/assets/txts/pg_0211.txt b/assets/txts/pg_0211.txt new file mode 100644 index 0000000000000000000000000000000000000000..94bfe19ae8bcaeb0b27e9f670bcb2c103c057ed1 --- /dev/null +++ b/assets/txts/pg_0211.txt @@ -0,0 +1,32 @@
is to read, the complexity of the layout graph, and visual richness. Next follows the question analysis: e.g., specificity, complexity, ambiguity, relevance, and novelty. Each of these can be measured by heuristic approximations such as the number of tokens or entities, how many of the entities literally appear in the document, the number of possible answers, the context size required to answer the question, the semantic overlap between the question and the document, how similar the question is to training data questions, the grammatical correctness, and the syntactic complexity. Finally, the metadata analysis: e.g., the number of documents in the same domain, the number of documents of the same type, and the number of documents in the same language.
(ii) The output uncertainty can be modeled by the confidence of the LLM in its predicted answer, which can be estimated by PUQ methods and a variety of CSFs [111], which are hypothesized to capture complementary sources of uncertainty. Specific to the answer, the same question-document aspects return here, along with how extractive the answer is, the answer structure, and paraphrasing diversity.
(iii) Feature representations of new documents, questions, and answers can be assessed relative to their individual and joint distance to the training distribution [477]. This will be quintessential for distributional shift detection.
A failure forecaster trained to predict the performance of the LLM on all this information can be used to decide whether to abstain from answering, ask for clarifications from the model or human, ask for additional context, demand question rephrasing or a clearer document input, or even additional metadata. Ultimately, this will be useful to improve reliability and robustness for real-world IA-DU applications, where the risk of failure demands substantial control.
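As a minimal sketch of how simple such a forecaster can be, a logistic regression over the three feature groups; the feature names are illustrative assumptions drawn from the discussion above, not a fixed specification:

import numpy as np
from sklearn.linear_model import LogisticRegression

def forecast_features(sample):
    # One feature vector per (document, question, answer) triple, covering
    # (i) input, (ii) output, and (iii) distributional signals.
    return np.array([
        sample["ocr_quality"],            # (i) document quality
        sample["question_length"],        # (i) question complexity proxy
        sample["llm_confidence"],         # (ii) CSF of the predicted answer
        sample["answer_extractiveness"],  # (ii) answer overlap with the document
        sample["train_distance"],         # (iii) distance to training distribution
    ])

# Train on held-out triples labeled with whether the LLM answered correctly
X = np.stack([forecast_features(s) for s in held_out_samples])
y = np.array([s["is_correct"] for s in held_out_samples])
forecaster = LogisticRegression(class_weight="balanced").fit(X, y)

# At inference: abstain (or escalate) when predicted success probability is low
p_success = forecaster.predict_proba(X_new)[:, 1]
abstain = p_success < 0.5

Because the forecaster is cheap to retrain, the threshold and feature set can be revisited whenever new failure data arrives, without touching the LLM.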
\ No newline at end of file diff --git a/assets/txts/pg_0212.txt b/assets/txts/pg_0212.txt new file mode 100644 index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf --- /dev/null +++ b/assets/txts/pg_0212.txt @@ -0,0 +1 @@
\ No newline at end of file diff --git a/assets/txts/pg_0256.txt b/assets/txts/pg_0256.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdba609a16293924b742c098662df4848084b755 --- /dev/null +++ b/assets/txts/pg_0256.txt @@ -0,0 +1,46 @@
20news and AAPD), retain the original document lengths, remapping tokens with a frequency lower than 3 to UNK; PAD tokens are masked throughout. For TextCNN, 300-D embeddings are uniformly initialized, upon which three different kernel sizes (3, 4, 5) operate with 100 feature maps per kernel, followed by a max-pooling operation. For BERT, we tokenize and encode using the standard BERT tokenizer, with the maximum sequence length determined per dataset [20news: 250, CLINC: 50, IMDB: 350 and Reuters/AAPD: 200].
Following the MC Dropout procedure, we apply dropout [431] with a rate of 0.5 after each non-linear weight layer. We found a global weight decay rate of 1e-4 [224, 293] to work well for TextCNN, whereas we disabled weight decay for BERT since it over-penalized model complexity, resulting in vanishing gradients. When training TextCNN, Adam optimizes the cross-entropy or heteroscedastic loss (see Section 3.3.2.4) with a learning rate of 1e-3 for 45 epochs on batches of size 32. For fine-tuning BERT, we schedule the learning rate from 1e-5 to 1e-6 with batch size 16 and train for 20 epochs (longer than the original recommendation, following [436]). We use early stopping conditioned on the validation loss, with sufficient epochs to ensure all models are trained until convergence; otherwise, the models might have learned to approximate the mean of the predictive posterior distribution well, but not the variance. At evaluation time, we estimate the predictive mean and uncertainties by drawing T samples from the approximated predictive posterior distribution or by averaging over M models. We have empirically set T to 10 and, for ensembles, the number of models M to 5.
B Practical Considerations
B.1 Take-home Summary
Concretely, for a multi-class problem with a large number of classes, incorporating input-dependent data uncertainty improves accuracy and novelty detection. With high label cardinality in multi-label classification, we recommend ensembling for more reliable epistemic uncertainty estimation. More generally, we advise against using MC Dropout if the dropout rate and weight regularization are not fine-tuned for the problem at hand, drawing parallels to dropout probability rates adaptively learned with Concrete Dropout.
Hyperparameter considerations We reiterate important hyperparameters and reasonable defaults for text classification tasks similar to our benchmark setup and applications of the above.
\ No newline at end of file diff --git a/assets/txts/pg_0257.txt b/assets/txts/pg_0257.txt new file mode 100644 index 0000000000000000000000000000000000000000..da614e414be9f7b9564e12f10e36bb61c4248a67 --- /dev/null +++ b/assets/txts/pg_0257.txt @@ -0,0 +1,63 @@
• Dropout rate p: the original work suggested a fixed binary rate (p=0.5), whereas our experiments indicate that different rates are more applicable per dataset. It is best to cross-validate layer-wise dropout probabilities for any real-world application; where impossible, this warrants the low effort of incorporating Concrete Dropout, consequently reducing experimentation time.
• Weight decay L2: it is best to start with small values [1e-6 - 1e-4] and fine-tune accordingly. Take care not to apply global weight decay in the case of pretrained weights, which already have high weight magnitudes, as this possibly impedes learning.
• MC Dropout T: a small number (T=10) of stochastic samples suffices; with a large number of classes K, scale T sub-linearly with K. T also applies to the number of samples drawn to calculate the heteroscedastic loss, so beware of increasing it to too large values, since it affects training compute.
• Ensemble size M: a total of (M=5) ensemble models is plenty, certainly when combined with a fine-tuned dropout rate at the individual model level.
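As a minimal sketch of the evaluation-time procedure described above (our benchmark code differs in details), MC Dropout inference with T stochastic samples, yielding the predictive mean and the uncertainty quantities used throughout this appendix (predictive entropy, expected data uncertainty, and mutual information):

import torch

def mc_dropout_predict(model, x, T=10):
    # Keep dropout active at inference by staying in train mode
    # (safe here since our models contain no batch normalization)
    model.train()
    with torch.no_grad():
        probs = torch.stack([model(x).softmax(-1) for _ in range(T)])  # T x N x K
    mean = probs.mean(0)  # predictive mean
    total = -(mean * mean.clamp_min(1e-12).log()).sum(-1)  # predictive entropy
    data = -(probs * probs.clamp_min(1e-12).log()).sum(-1).mean(0)  # expected entropy
    mutual_info = total - data  # epistemic (model) uncertainty
    return mean, total, data, mutual_info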
B.2 Compute vs. Performance Trade-off
Next to performance, practitioners are generally concerned with computational and memory costs. [462] present similar concerns in the benchmarking of uncertainty methods. Considering the cost of compute vs. storage, each uncertainty method impacts both differently. Following [348], we present computational and memory costs for the evaluated methods symbolically (Big-O), with m flops or storage for a trained model, l flops or storage for the last layer, T the number of samples or replications, and ι the number of GP inducing points.
Table A.1. Compute and storage costs in Big-O notation [348] for uncertainty methods.
Method | Compute/N | Storage
Baseline | m | m
MC (Concrete) Dropout | mT | m
Heteroscedastic | m + l(T − 1) | m(+l)
Deep Ensemble | mT | mT
cSGMCMC | m | mT
SNGP | m + ι² | m
Our experiments were carried out on a system with an Intel Core i7-10750H 2.6 GHz CPU and an NVIDIA GeForce RTX 2070 Max-Q GPU.
Additionally, we provide informative tables with training (Table A.2) and test (Table A.3) timings over all single models on CLINC-OOS.
\ No newline at end of file diff --git a/assets/txts/pg_0258.txt b/assets/txts/pg_0258.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f0f49e39dac023a41aed4d66ac204d7546c074f --- /dev/null +++ b/assets/txts/pg_0258.txt @@ -0,0 +1,217 @@
Table A.2. CLINC-OOS models with training timings (in seconds) per epoch and total running time.
method | architecture | train time/epoch | epoch finished | train runtime
Unregularized | TextCNN | 32 | 8 | 256
Regularized | TextCNN | 32 | 28 | 896
Heteroscedastic | TextCNN | 59 | 17 | 1003
Concrete Dropout | TextCNN | 35 | 12 | 420
Heteroscedastic Concrete Dropout | TextCNN | 58 | 10 | 580
Unregularized | BERT | 420 | 5 | 2100
Regularized | BERT | 691 | 11 | 7601
Heteroscedastic | BERT | 710 | 16 | 11360
Concrete Dropout | BERT | 679 | 9 | 6111
Heteroscedastic Concrete Dropout | BERT | 707 | 16 | 11312
Table A.3. CLINC-OOS models with inference timings presented in unit time for how many batches or samples can be processed in 1 second wall-clock time on CPU and GPU. For the short sequences of CLINC, both models allow a batch size of 32.
architecture | method | # batch (gpu) | # sample (gpu) | # batch (cpu) | # sample (cpu)
TextCNN | Unregularized | 59.0 | 1891 | 63.0 | 2043
TextCNN | Regularized | 66.0 | 2134 | 60.0 | 1922
TextCNN | MC Dropout | 53.0 | 1708 | 32.0 | 1050
TextCNN | Heteroscedastic | 693.0 | 22176 | 482.0 | 15444
TextCNN | MC Heteroscedastic | 47.0 | 1525 | 38.0 | 1216
TextCNN | Concrete Dropout | 66.0 | 2130 | 40.0 | 1293
TextCNN | MC Concrete Dropout | 48.0 | 1541 | 25.0 | 827
TextCNN | Heteroscedastic Concrete Dropout | 756.0 | 24205 | 318.0 | 10197
TextCNN | MC Heteroscedastic Concrete Dropout | 48.0 | 1561 | 27.0 | 874
BERT | Unregularized | 6.0 | 223 | 0.8 | 25
BERT | Regularized | 9.0 | 306 | 0.8 | 26
BERT | MC Dropout | 0.9 | 28 | 0.1 | 2
BERT | Heteroscedastic | 10.0 | 325 | 0.8 | 26
BERT | MC Heteroscedastic | 1.0 | 31 | 0.1 | 2
BERT | Concrete Dropout | 7.0 | 245 | 0.9 | 27
BERT | MC Concrete Dropout | 1.0 | 30 | 0.1 | 2
BERT | Heteroscedastic Concrete Dropout | 6.0 | 218 | 0.9 | 27
BERT | MC Heteroscedastic Concrete Dropout | 0.9 | 30 | 0.1 | 2
C Detailed Experiment Results
C.1 Zoom-in Benchmark Evidence
In this Subsection, we report additional evidence in support of our results which did not suit the main manuscript.
C.2 Absolute Benchmark Results
Next to reporting critical differences to analyze the relative performance of uncertainty methods, we also report results as summary statistics, following the
\ No newline at end of file diff --git a/assets/txts/pg_0259.txt b/assets/txts/pg_0259.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f666d9cdde98b0029b89c19c4610ebc1b8b23c3 --- /dev/null +++ b/assets/txts/pg_0259.txt @@ -0,0 +1,22 @@
Figure A.1. Comparison with NLL(↓) for dataset-specific differences in method performance.
methodology of [462]. Firstly, we report performance averaged over both runs and datasets, with the standard deviation over datasets. We indicate the best mean performance in bold. For various metrics the standard deviation is very large, which shows that the average over datasets would be a poor measure of central tendency for our benchmark. Since we benchmark on three multi-class and two multi-label datasets, any aggregate would be biased towards multi-class performance, which is why we specifically opted for rank and critical difference to analyze the relative performance of each method.
Additionally, we compute the performance averaged over datasets, with the standard deviation over multiple runs for all individual models. All raw model results are available at https://github.com/Jordy-VL/uncertainty-bench/tree/main/experiments/raw_results. We refer to the original paper for the larger detail tables with results averaged over datasets and runs.
\ No newline at end of file diff --git a/assets/txts/pg_0260.txt b/assets/txts/pg_0260.txt new file mode 100644 index 0000000000000000000000000000000000000000..845da8b3c9fa80ab0e06afd2010d995c3304ee79 --- /dev/null +++ b/assets/txts/pg_0260.txt @@ -0,0 +1,10 @@
Figure A.2. We report the Pearson Correlation Coefficient (PCC) between uncertainty values and the binary variable ID-OOD for Amazon product review datasets. A higher absolute correlation score points to a stronger association between uncertainty and out-of-domain detection. *Model Uncertainty (MU), Data Uncertainty (DU), Mutual Information (MI).
\ No newline at end of file diff --git a/assets/txts/pg_0261.txt b/assets/txts/pg_0261.txt new file mode 100644 index 0000000000000000000000000000000000000000..6545c653c628da666d44bbd65205a2c0460ceb1d --- /dev/null +++ b/assets/txts/pg_0261.txt @@ -0,0 +1,50 @@
Figure A.3. A selection of the most interesting Gaussian kernel density plots over (abbreviated) model setup metrics evaluated on all datasets, in row order 20news (a-c), CLINC150 (d-f), imdb (g-i), Reuters (j-l), AAPD (m-o): (a) Heteroscedastic Ensemble - H; (b) MC Ensemble - S; (c) Deep Ensemble Regularized - MI; (d) Heteroscedastic CD Ensemble - S; (e) Deep Ensemble - MU; (f) MC Dropout - MU; (g) Heteroscedastic Ensemble - H; (h) Concrete Dropout - S; (i) MC Concrete Dropout - MU; (j) MC CD Ensemble - H; (k) Concrete Dropout Ensemble - S; (l) Deep Ensemble - H; (m) Deep Ensemble - MU; (n) Heteroscedastic Ensemble - MI; (o) MC Dropout - MI. Each plot captures probabilistic density over correct ID (green), incorrect ID (red) and OOD (purple). From left to right, we have selected a high-rank, middle-rank, and low-rank method and uncertainty quantity combination. The density estimates demonstrate clear empirical differences over all datasets for various uncertainty quantities.
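For completeness, the association score reported in Figure A.2 reduces to a point-biserial correlation; a minimal sketch (our analysis code may differ):

import numpy as np

def uncertainty_ood_pcc(uncertainty, is_ood):
    # Pearson correlation between continuous uncertainty values and a
    # binary ID (0) / OOD (1) indicator, i.e., a point-biserial correlation.
    u = np.asarray(uncertainty, dtype=float)
    y = np.asarray(is_ood, dtype=float)
    return np.corrcoef(u, y)[0, 1]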
\ No newline at end of file diff --git a/assets/txts/pg_0262.txt b/assets/txts/pg_0262.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a71a115fe6c9f31bce80194223630427254210c --- /dev/null +++ b/assets/txts/pg_0262.txt @@ -0,0 +1,28 @@
Appendix B
Appendix - BDPC
A Existing DC Datasets
As the datasets from Table 2 did not satisfy the requirements for large-scale multipage DC benchmarking, we discuss them in the supplementary material for interested readers.
Tobacco-3482 [232] is another subset of IIT-CDIP with fewer samples and a smaller label set than RVL-CDIP.
Tobacco-800 [553] has been used for page stream segmentation ([494], similarly defined as in [328]), as it contains consecutively numbered multipage business documents.
NIST The NIST Structured Forms Database [98] consists of 5,590 binary synthesized documents from 20 different classes of tax forms.
MARG The MARG (Medical Article Records Groundtruth) database [290] is a layout-based classification benchmark containing 1,553 documents, which are mainly the first pages of medical journals.
TAB [328] is a recently introduced page stream segmentation dataset targeting binary classification to detect document boundaries in multipage streams. It consists of a sample of 44,769 PDF documents from the Truth Tobacco Industry Documents (TTID) archives.
\ No newline at end of file diff --git a/assets/txts/pg_0263.txt b/assets/txts/pg_0263.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc19247d48c441c05c55f3b64c5aaec62c3fd2e3 --- /dev/null +++ b/assets/txts/pg_0263.txt @@ -0,0 +1,19 @@
B Visualization of Proposed DC Datasets
As we have contributed two novel datasets consisting of multipage documents in PDF format, adding visualizations is non-trivial. The datasets are hosted on the HuggingFace Hub (https://huggingface.co/datasets/bdpc), for which, at the time of submission, the dataset viewer does not support PDF data. Rather than adding examples in the manuscript, which is tedious for PDF documents with multiple pages, we have built an interactive app (https://huggingface.co/spaces/jordyvl/viz_bdpc). This allows for the visualization of samples from the proposed datasets, with an additional filter on the labels, where both datasets follow the original RVL-CDIP label taxonomy.
\ No newline at end of file diff --git a/assets/txts/pg_0264.txt b/assets/txts/pg_0264.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce6702adabaab08e00c1ce03f4625d7dd4643f92 --- /dev/null +++ b/assets/txts/pg_0264.txt @@ -0,0 +1,35 @@
Appendix C
Appendix - DUDE
A Baseline Experiments Setup
In this Section, we describe the implementation details¹ for the architectures and inference methods used in our benchmark.
A.1 Hyperparameter Defaults
Refer to Table C.1.
A.2 Generative LLM Prompt Fine-tuning
The performance of GPT3.5 models was assessed in two settings: 0-shot and 4-shot. In the 0-shot setting, the prompt included instructions similar to those provided to annotators to teach them how to annotate. In the 4-shot setting, the prompt was enhanced with the content of a single document from the training set along with four questions of different types (extractive, abstractive, list, and not answerable) to better gauge the models' abilities.
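For illustration only (the exact wording used in our experiments is not reproduced here), a 4-shot-style prompt could be structured as follows:

# Hypothetical sketch of the 4-shot prompt structure; the actual
# instructions and examples used in our experiments differed.
FOUR_SHOT_TEMPLATE = """You are given a document and must answer questions about it.
Answer 'unanswerable' when the document does not contain the answer.

Document: {example_document}
Question: {extractive_q}  Answer: {extractive_a}
Question: {abstractive_q}  Answer: {abstractive_a}
Question: {list_q}  Answer: {list_a}
Question: {unanswerable_q}  Answer: unanswerable

Document: {target_document}
Question: {target_question}  Answer:"""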
+The 0-shot prompt is analogous to the 4-shot prompt; the key distinction is
+that it lacks the first document and the example question-and-answer pairs.
+
+1 Main framework used: https://github.com/rubenpt91/MP-DocVQA-Framework
+
+232
\ No newline at end of file
diff --git a/assets/txts/pg_0265.txt b/assets/txts/pg_0265.txt
new file mode 100644
index 0000000000000000000000000000000000000000..22abbcbe88c7edbcd62d199740d98b944b61ab98
--- /dev/null
+++ b/assets/txts/pg_0265.txt
@@ -0,0 +1,111 @@
+BASELINE EXPERIMENTS SETUP 233
+
+Hyper-Parameter             T5               T5+2D        HiVT5
+Epochs                      10               10           10
+Warm-up (iterations)        1000             250          1000
+Optimizer                   Adam, AdamW      Adafactor    Adam
+Gradient acc.               False            8            False
+Lower case                  True             True         True
+Max. Seq. Length            512, 8192        512, 8192    20480
+Generation (Max. Tokens)    100              100          50
+Batch size                  3                8            1
+Learning rate               1E-04, 2E-04     2E-04        2E-04
+Training time (per epoch)   1h, 10h          1.5h, 5h     10h
+GPU Hardware                TITAN RTX, A100  A100 (80GB)  TITAN RTX (24GB)
+
+Table C.1. Hyperparameters used for fine-tuning T5, T5+2D and HiVT5 on DUDE.
+When two values are placed in a single column, they refer to the model's versions with
+512 and 8192 input sequence length, respectively.
+
+For the inference process, we utilized the default prompt completion settings
+outlined in the OpenAI documentation, with the exception of the temperature
+parameter, which was lowered to 0.0. This adjustment was made to ensure that
+the output would be more deterministic and focused, with less emphasis on
+generating creative variations.
+Only after our prompting experiments had been completed did we realize the
+opportunity to assess confidence estimation using chained prompts (Please give
+a confidence between 0 and 1 about how certain you are this is the answer.), as
+in [219]. Since we did not save our dialogue states, and considering the expense,
+we leave this for future work.
+
+A.3  Confidence Estimation
+
+This subsection details the confidence scoring functions for the baselines, as
+these are not reported in standard practice.
+We define confidence as the predicted probability of the top-1 prediction, often
+arising as the largest value from softmax normalization of the logits from a final
+model layer (head).
\ No newline at end of file
diff --git a/assets/txts/pg_0266.txt b/assets/txts/pg_0266.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c57c8e0899bf055b024481f2ddfee9e6b3b27a
--- /dev/null
+++ b/assets/txts/pg_0266.txt
@@ -0,0 +1,61 @@
+234 APPENDIX - DUDE
+
+Encoder-based models output logits for all possible start and end positions
+of the answer within the provided context.
While the predicted answer of
+such a span prediction architecture comes from the highest-scoring valid
+(non-negative span) combination of a start and end logit, the predicted answer
+confidence can be obtained by the following procedure (BS: batch size, S:
+sequence length):
+
+import numpy as np
+import torch
+
+# Standard span prediction forward call
+outputs = model(**inputs, start_positions=start_positions,
+                end_positions=end_positions)
+
+# Assumes masking all padding and special tokens after softmax with 0
+start = outputs.start_logits.softmax(dim=1).unsqueeze(0).unsqueeze(-1)  # 1 x BS x S x 1
+end = outputs.end_logits.softmax(dim=1).unsqueeze(0).unsqueeze(1)       # 1 x BS x 1 x S
+
+# Probability of each valid (end >= start) start/end pair;
+# triu() zeroes out candidates where the end precedes the start
+candidate_matrix = torch.matmul(start, end).triu().detach().numpy()  # 1 x BS x S x S
+
+# Obtain the highest scoring candidate span per sample by multi-index argmax
+flat_probs = candidate_matrix[0].reshape(candidate_matrix.shape[1], -1)  # BS x S*S
+best = flat_probs.argmax(axis=1)
+start_idx, end_idx = np.unravel_index(best, candidate_matrix.shape[2:])
+batch_answer_confs = flat_probs[np.arange(flat_probs.shape[0]), best]
+
+Decoder-based models are not restricted to spans and can output an arbitrary,
+though often controllable, number of text tokens, denoted S'. The logits at the
+final layer take the shape BS x S' x V, where V is the tokenizer's vocabulary
+size (32.1K for T5-base). The following confidence estimation procedure is
+applied to decoder outputs:
+
+# Standard decoder-based greedy forward pass (without labels)
+outputs = model.generate(**input_ids, output_scores=True,
+                         return_dict_in_generate=True)
+
+# BS x S' x V; drop the EOS token, then softmax + max per generated token
+batch_logits = torch.stack(outputs.scores, dim=1)[:, :-1, :]
+decoder_outputs_confs = torch.amax(batch_logits.softmax(-1), 2)
+# Remove padding from batching decoder outputs of variable sizes
+decoder_outputs_confs_masked = torch.where(
+    outputs.sequences[:, 1:-1] > 0,
+    decoder_outputs_confs,
+    torch.ones_like(decoder_outputs_confs))
+# Multiply probabilities over tokens to score the full sequence
+batch_answer_confs = decoder_outputs_confs_masked.prod(1)
\ No newline at end of file
diff --git a/assets/txts/pg_0267.txt b/assets/txts/pg_0267.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bb1a96adf3746f044adc6a7f8330fe72c7553cb9
--- /dev/null
+++ b/assets/txts/pg_0267.txt
@@ -0,0 +1,24 @@
+QUALITATIVE EXAMPLES 235
+
+A.4  Evaluation
+
+All metric implementations (ANLS, ECE, AURC) are made available as
+a standalone repository. Additionally, we provide an online service where
+researchers can evaluate their methods against a blind (questions-only) test
+dataset. General metric descriptions are provided in Section 2.2.3, with additional
+implementation details and motivated design choices. While ANLS can account
+for shortcomings of OCR and formatting issues, evaluation of generated text is
+notoriously complex [377] and requires more research.
+
+B  Qualitative Examples
+
+As is customary, we provide some interesting, handpicked test set examples
+with predictions from some of the baselines in our study.
\ No newline at end of file
diff --git a/assets/txts/pg_0268.txt b/assets/txts/pg_0268.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95417ae1da4d55ceaa154043fd8fffc8c3aa6b72
--- /dev/null
+++ b/assets/txts/pg_0268.txt
@@ -0,0 +1,62 @@
+236 APPENDIX - DUDE
+
+Low complexity.
Where the document has been printed? +Simple, extractive question, plain-text evidence. + +Source + +Answer + +Ground truth +Human + +New Delhi, India +India + +T5 +ChatGPT + +IS : 9304 - 1979 +The document does +not mention where +it has been printed. +Bela Pack n Print. +New Delhi, India +New Delhi, India +Page 1 +new delhi, india + +GPT3 +T5-2D +HiVT5 +Longformer + +ANLS + +Conf. + +0.0 + +— + +0.0 +0.0 + +0.56 +— + +0.0 + +— + +1.0 +0.0 +1.0 + +0.09 +0.18 +0.72 + + \ No newline at end of file diff --git a/assets/txts/pg_0269.txt b/assets/txts/pg_0269.txt new file mode 100644 index 0000000000000000000000000000000000000000..012c1812417e2eca95882487d4cbbb8b423dd96b --- /dev/null +++ b/assets/txts/pg_0269.txt @@ -0,0 +1,108 @@ +QUALITATIVE EXAMPLES + +237 + +High complexity. Is there any redacted section on the document? +Abstractive question that requires knowledge about possible document elements. + +Source + +Answer + +Ground truth +Human + +No +No + +ANLS + +Conf. + +1.0 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 +LayoutLMv3 + +yes +[Not-answerable] +[Not-answerable] +No +Yes +approved for release + +0.0 +0.0 +0.0 +1.0 +0.0 +0.0 + +0.17 +— +— +0.43 +0.55 +0.01 + +Requires arithmetic. What is the difference between how much Operator II +and Operator III makes per hour? +The question requires table comprehension, determining relevant values, dividing +extracted integers, and correcting the subject-verb agreement. + +Source + +Answer + +Ground truth +Human + +$5 +$5 + +T5 +ChatGPT +GPT3 + +200 +$5 per hour. +Operator II ($17/hr) +| +Operator +III +($22/hr) +[Not-answerable] +[Not-answerable] + +T5-2D +HiVT5 + +ANLS + +Conf. + +1.0 + +— + +0.0 +0.0 +0.0 + +0.28 +— +— + +0.0 +0.0 + +0.31 +0.15 + + \ No newline at end of file diff --git a/assets/txts/pg_0270.txt b/assets/txts/pg_0270.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce0a31bd405cbab0981b15cc6ee12a4653fc5216 --- /dev/null +++ b/assets/txts/pg_0270.txt @@ -0,0 +1,62 @@ +238 + +APPENDIX - DUDE + +Visual evidence (chart). What is the maximum percentage of the blue graph +line on page 8? +A highly demanding question that requires simultaneous competency of visual +comprehension (locating chart and line color), navigating through layout +(determining adequate page), and numerical comparison (deciding on the highest +value). + +Source + +Answer + +Ground truth +Human + +75% +75 + +ANLS + +Conf. + +0.7 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 +BigBird +LayoutLMv3 + +76 +[Not-answerable] +76% +32.0 +45% +32 +80% + +0.0 +0.0 +0.7 +0.0 +0.7 +0.0 +0.0 + +0.25 +— +— +0.00 +0.05 +0.47 +0.15 + + \ No newline at end of file diff --git a/assets/txts/pg_0271.txt b/assets/txts/pg_0271.txt new file mode 100644 index 0000000000000000000000000000000000000000..1367c584c4473e779406cc759a5b40f3c63cf680 --- /dev/null +++ b/assets/txts/pg_0271.txt @@ -0,0 +1,58 @@ +QUALITATIVE EXAMPLES + +239 + +Visual evidence (handwriting). + +What is the handwritten date on page 1? + +The question requires visual comprehension (recognition of handwriting) and +layout navigation (determining the adequate page). + +Source + +Answer + +Ground truth +Human + +13-XII-50 +13-XII-50 + +ANLS + +Conf. 
+ +1.0 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 +BERTQA + +1977-01-01 +[Not-answerable] +15 December 1950 +1950-12-15 +1977-07-01 +2006 / 1 + +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 + +0.24 +— +— +0.24 +0.11 +0.5 + + \ No newline at end of file diff --git a/assets/txts/pg_0272.txt b/assets/txts/pg_0272.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d4c3aad835749104d520a8b87b3780ebd9a7a8e --- /dev/null +++ b/assets/txts/pg_0272.txt @@ -0,0 +1,51 @@ +240 + +APPENDIX - DUDE + +Requires counting. How many pages have a signature? +The question requires visual comprehension (recognition of signature), knowledge +about layout, and counting. + +Source + +Answer + +ANLS + +Conf. + +Ground truth +Human + +2 +2 + +1.0 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 + +1 +4 +[Not-answerable] +4 +4 + +0.0 +0.0 +0.0 +0.0 +0.0 + +0.01 +— +— +0.69 +0.41 + + \ No newline at end of file diff --git a/assets/txts/pg_0273.txt b/assets/txts/pg_0273.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa0d6fadfe95953c293fa7962933003b47a21708 --- /dev/null +++ b/assets/txts/pg_0273.txt @@ -0,0 +1,65 @@ +QUALITATIVE EXAMPLES + +241 + +Visual evidence (map), multi-hop. Which states don’t have any marijuana +laws? +The multi-hop question requires visually comprehending the map and linking +knowledge from its legend with depicted regions. + +Source + +Answer + +Ground truth +Human + +ID | SD | KS +ID | SD | KS + +T5 + +WA ME MT ND +MN OR VT ID NH +SD WI NY MA MI +[Not-answerable] +American Samoa +i +- + +ChatGPT +GPT3 +T5-2D +HiVT5 + +B.1 + +ANLS + +Conf. + +1.0 + +— + +0.0 + +0.28 + +0.0 +0.0 +0.0 +0.0 + +——0.03 +0.02 + +Qualitative Examples - Competition + +We provide some interesting, hand-picked test set examples with predictions +from the submitted competition methods. +Low complexity. Who is the president and vice-chancellor? Despite +the question’s relatively straightforward nature, some systems struggle with +providing the appropriate answer. One can hypothesize it is the result of limited + + \ No newline at end of file diff --git a/assets/txts/pg_0274.txt b/assets/txts/pg_0274.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9bdcc43c6228d5240e39e1ac8e59367a82b43ee --- /dev/null +++ b/assets/txts/pg_0274.txt @@ -0,0 +1,135 @@ +242 + +APPENDIX - DUDE + +context (the answer is located at the end of the document), i.e., models either +hallucinate a value or provide a name found earlier within the document. +Source + +Answer + +Ground truth +Human + +Jack N. Lightstone +Jack N. Lightstone + +ANLS + +Conf. + +1.0 + +— + +T5-base +MMT5 +UDOP+BLIP2+GPT +HiVT5+modules + +James L. Turk +james l. turk +jack n. lightstone +Jack N. Whiteside + +0.0 +0.0 +1.0 +0.6 + +0.0 +1.0 +0.9 +0.6 + +Requires graphical comprehension. Which is the basis for jurisdiction? +To provide a valid answer, the model needs to comprehend the meaning of +the form field and recognize the selected checkbox. None of the participating +systems was able to spot the answer correctly. + +Source + +Answer + +Ground truth + +U.S. Goverment Plaintiff +U.S. Goverment Plaintiff + +Human +T5-base +MMT5 + +Declaration of taking +united states district +court + +HiVT5+modules +UDOP+BLIP2+GPT public purpose + +ANLS + +Conf. + +1.0 + +— + +0.0 +0.0 + +0.1 +1.0 + +0.0 +0.0 + +1.0 +0.4 + +Requires comparison. In which year does the Net Requirement exceed +25,000? The question requires comprehending a multipage table and spotting if +any values fulfill the posed condition. 
Some of the models resort to plausible +answers (one of the three dates that the document covers), whereas others +correctly decide there is no value exceeding the provided amount. +Source + +Answer + +Ground truth +Human + +[Unanswerable] +[Unanswerable] + +ANLS + +Conf. + +1.0 + +— + +T5-base +MMT5 +UDOP+BLIP2+GPT +HiVT5+modules + +[Unanswerable] +2018 +[Unanswerable] +2017 + +1.0 +0.0 +1.0 +0.0 + +0.2 +1.0 +1.0 +0.8 + +Requires arithmetic. What is the difference between how much Operator II + + \ No newline at end of file diff --git a/assets/txts/pg_0275.txt b/assets/txts/pg_0275.txt new file mode 100644 index 0000000000000000000000000000000000000000..46773fdaa7b8796d44596d3b4448015ae817d09d --- /dev/null +++ b/assets/txts/pg_0275.txt @@ -0,0 +1,92 @@ +QUALITATIVE EXAMPLES + +243 + +and Operator III make per hour? The question requires table comprehension, +determining relevant values, and dividing extracted integers. None of the +participating models was able to fulfill this requirement. +Source + +Answer + +Ground truth +Human + +$5 +$5 + +ANLS + +Conf. + +1.0 + +— + +T5-base +MMT5 +UDOP+BLIP2+GPT +HiVT5+modules + +$0.00 +65% +-1.5 mile +$5,700.00 + +0.0 +0.0 +0.0 +0.0 + +0.0 +1.0 +0.0 +0.4 + +Requires counting and list output. What are the first two behavioral and +intellectual disabilities of people with FASDs? It seems most of the models +correctly recognized that this type of question requires a list answer but either +failed to comprehend the question or provided a list with incorrect length +(incomplete or with too many values). +Source + +Answer + +Ground truth +Human + +Learning disabilities | Hyperactivity +learning disabilities + +T5-base +MMT5 + +Early embryo brain development | External Genitals +heart beats | difficulty with attention | lung function +| hyperactivity | problem with judgment | speech and +language delays +UDOP+BLIP2+GPT hyperactivity | speech and language delays +HiVT5+modules +HIV/AIDS + +ANLS + +Conf. + +0.5 + +— + +0.0 +0.2 + +0.0 +1.0 + +0.5 +0.0 + +0.2 +0.6 + + \ No newline at end of file diff --git a/assets/txts/pg_0276.txt b/assets/txts/pg_0276.txt new file mode 100644 index 0000000000000000000000000000000000000000..18669c134f8f4a73b51b4cc4cdd5a1ce937fb46e --- /dev/null +++ b/assets/txts/pg_0276.txt @@ -0,0 +1,33 @@ +Appendix D + +Appendix - KDD +A + +Code and Datasets + +The proposed KD-VDU experimentation framework is available as linked +in the main manuscript. This includes the DIC benchmarking that is made +fully compatible with HuggingFace transformers, even allowing arbitrary image +classification models and (document) image datasets from HuggingFace hub. +The DLA benchmark is built around the Detectron2 framework, with +additional scripts for efficiency evaluation, visualization, and document data +preparation for downstream tasks. Downstream task experiments are made +available as a fork of the original LATIN-prompt [482] implementations with +additional modifications (4-bit quantization, question type ANLS evaluation, +InfographicsVQA dataloader, structure-preserving OCR respecting DLA +tokens). + +B + +Implementation Details + +DIC All runs are documented with hyperparameter configuration and +commandline arguments in a wandb project for complete transparency in +experiment results and reproducibility. +For RVL-CDIP, both teacher and student training is carried out for 10 epochs +with a batch size of (32 ViT, 64 ResNet) and AdamW with weight decay 5e-4 +and a learning rate of 1e-4 with a linear warmup of 10%. 
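+As a rough sketch, this recipe corresponds to a HuggingFace TrainingArguments
+configuration along these lines (values restate the text above; the output
+directory name is arbitrary):
+
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="rvl_cdip_distill",     # arbitrary name
+    num_train_epochs=10,
+    per_device_train_batch_size=32,    # 32 for ViT; 64 for ResNet students
+    learning_rate=1e-4,
+    weight_decay=5e-4,
+    warmup_ratio=0.1,                  # linear warmup over 10% of steps
+    lr_scheduler_type="linear",
+    optim="adamw_torch",
+)
+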
For Tobacco-3482,
+the default recipe is similarly trained for 100 epochs. All experiments were
+
+244
\ No newline at end of file
diff --git a/assets/txts/pg_0277.txt b/assets/txts/pg_0277.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fa6479428b805acf2c46d22c97e6748892039393
--- /dev/null
+++ b/assets/txts/pg_0277.txt
@@ -0,0 +1,88 @@
+IMPLEMENTATION DETAILS 245
+
+performed on a single NVIDIA GeForce RTX 3090 GPU (24GB vRAM).
+For some feature-based KD methods, the batch size had to be lowered to 16
+due to memory constraints. KD method hyperparameters were cross-validated
+to find the best performing configuration for each method, and are listed in the
+main manuscript result tables.
+DLA In this paper, the Mask R-CNN detection architecture is considered with
+two different backbone families: (1) CNNs: ResNet-50 and ResNet-101; (2)
+Transformers: ViT-Base and ViT-Tiny. All detection models are trained with
+Detectron2 [499], which uses the PyTorch deep learning library. The
+hyperparameters used are the following: (a) learning rate: 1e-4; (b) iterations:
+300k; (c) optimizer: Adam; (d) batch size: 16; (e) ROI head predictions: 128;
+(f) NMS threshold: 0.4; (g) confidence threshold: 0.6. For reproducibility, we
+share the exact config files used for each experiment as part of the Supplementary.
+Teacher and student model variants Tables D.1 and D.2 indicate the
+differences between the used teacher and student models in terms of
+parameterization and efficiency.
+
+Table D.1. Details of Vision Transformer model variants [101].
+
+Variant     Layers  Width  FFN   Heads  #Param
+Tiny (T)    12      192    768   3      5.5M
+Small (S)   12      384    1536  6      21.7M
+Base (B)    12      768    3072  12     85.8M
+
+Table D.2. Details of the efficiency of model checkpoints considered in this work.
+
+Model                            GFLOPs  GMACs  Params (M)
+microsoft/resnet-101             15.65   7.80   42.50
+microsoft/resnet-50              8.21    4.09   23.51
+google/vit-base-patch16-224      35.15   17.56  86.39
+microsoft/dit-base               35.15   17.56  85.81
+WinKawaks/vit-small-patch16-224  9.21    4.60   21.81
+WinKawaks/vit-tiny-patch16-224   2.51    1.25   5.56
+
+Downstream We extended the implementation of [482] to incorporate Llama-2
+[452] and built a similar dataloader for InfographicsVQA [310]. To enable strict
+compatibility, we used the same unified OCR format, DUE [47], for all datasets.
+This facilitated easy incorporation of DLA tokens into the OCR tokens without
\ No newline at end of file
diff --git a/assets/txts/pg_0278.txt b/assets/txts/pg_0278.txt
new file mode 100644
index 0000000000000000000000000000000000000000..87092c6b36f2544c99b9a21e07a40d75d44a07df
--- /dev/null
+++ b/assets/txts/pg_0278.txt
@@ -0,0 +1,44 @@
+246 APPENDIX - KDD
+
+disrupting the logic behind the original layout-aware representation of document
+text. As it involved zero-shot evaluation, no finetuning was attempted for this
+task; while finetuning could be left for future work, we want to reiterate that we
+sought to explore the innate ability of LLMs to ingest DLA-enriched prompts,
+and not the downstream task performance itself.
+
+C  Task Definitions
+
+The definitions have been incorporated as part of the fundamentals. Here we
+will only point to details that are not included in the main manuscript.
+To place each task in the context of document inputs, we define the following
+tasks and their respective inputs with common notation. We follow the notation
+established in [470] for document page inputs.
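+
+Before the formal notation below, a minimal container for such page inputs could
+look as follows (a sketch with our own naming, not code from [470]):
+
+from dataclasses import dataclass
+from typing import List, Tuple
+import torch
+
+@dataclass
+class PageInput:
+    image: torch.Tensor                     # v, shape (C, H, W)
+    tokens: List[str]                       # u = (w_1, ..., w_T), OCR or born-digital text
+    boxes: List[Tuple[int, int, int, int]]  # s, one (x1, y1, x2, y2) box per token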
+A page p consists of an image v ∈ R^(C×H×W) (number of channels, height,
+and width, respectively) with T word tokens u = {w_t}_{t=1}^T, organized
+according to a layout structure s = {(x_t^1, y_t^1, x_t^2, y_t^2)}_{t=1}^T,
+typically referred to as token bounding boxes, coming from OCR or available
+from a born-digital document.
+DIC As a prototypical instance of classification [472], the goal is to learn an
+estimator f : X → Y using N supervised input-output pairs (X, Y) ∈ X × Y
+drawn i.i.d. from an unknown joint distribution P(X, Y). In the context of
+DIC, the input space X is the set of all document images, and the output space
+Y is the set of all document classes (e.g., invoice, email, form, advertisement,
+etc.). The goal is to learn a function f that maps a document image x ∈ X to
+a document class y ∈ Y, such that f(x) = y. Covariate shift [418] occurs when
+the input distribution P(X) changes between the training and evaluation sets,
+but the conditional distribution P(Y|X) remains the same. Put plainly, both
+sets share the same document classes, yet the visual appearance, layout and
+content of the document images can differ. For example, RVL-CDIP-N contains
+more modern documents with color, whereas all RVL-CDIP [241] documents
+are greyscale.
+DLA The task of DLA can be formulated as a function that processes a
+document image input and outputs structured information about its logical
+layout elements (e.g., text blocks, headers, figures, charts, plots, tables). Let
+DLA(x) represent the output predictions of the DLA process as a set of tuples,
\ No newline at end of file
diff --git a/assets/txts/pg_0279.txt b/assets/txts/pg_0279.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1523019ef36b4ee99c269c699be759260653b535
--- /dev/null
+++ b/assets/txts/pg_0279.txt
@@ -0,0 +1,98 @@
+ADDITIONAL EXPERIMENT RESULTS 247
+
+where each tuple (b_j, c_j, m_j) represents one of J detected logical layout
+elements:
+
+DLA(x) = {(b_j, c_j, m_j)}_{j=1}^J    (D.1)
+
+For each j, b_j denotes the bounding box of the j-th detected element, defined
+as (x_j, y_j, w_j, h_j) (in the popular COCO format); c_j is the class label of
+the j-th element, indicating its object category; and m_j is a set of additional
+properties or information (metadata attributes, predicted scores; considered
+optional) associated with the j-th element, which can vary depending on the
+type and context of the layout components.
+Zero-shot Document Visual Question Answering Given a document d and
+a question q, the goal of zero-shot DocVQA is to predict the answer a to the
+question q from the document, assuming a single document image for simplicity.
+Following the text-only LLM approach in [482], each document image needs to
+be translated to text, either from OCR or from a born-digital document, and
+the question is translated to a prompt p. The prompt p is a sequence of tokens
+that is fed to the LLM, together with a potential task instruction, and the
+document image text D, which is structured following a heuristic procedure
+operating on the text tokens (T) and respective bounding boxes (see Table 6.2).
+
+D  Additional Experiment Results
+
+Table D.3. Results of different KD strategies benchmarked for ResNets applied on the
+RVL-CDIP dataset.
+Dataset +RVL-CDIP +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k + +Teacher +ResNet-101 +– +ResNet-101 +ResNet-101 +ResNet-101 +ResNet-101 +ResNet-101 +ResNet-101 + +Student +– +ResNet-50 +ResNet-50 + +Method +Baseline +Baseline +Vanilla [τ = 2.5, α = 0.5] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [∅ projector] +SimKD [CNN] +FitNet [middle] + +ACC +0.819 +0.783 +0.783 +0.785 +0.786 +0.769 +0.797 +0.758 + +AURC +0.043 +0.059 +0.059 +0.063 +0.058 +0.067 +0.053 +0.087 + +ECE +0.017 +0.039 +0.039 +0.073 +0.032 +0.025 +0.023 +0.178 + + \ No newline at end of file diff --git a/assets/txts/pg_0280.txt b/assets/txts/pg_0280.txt new file mode 100644 index 0000000000000000000000000000000000000000..699e666f5ab8f086fd4885d668f634ee11f4122d --- /dev/null +++ b/assets/txts/pg_0280.txt @@ -0,0 +1,116 @@ +248 + +APPENDIX - KDD + +Table D.4. Results of different KD strategies benchmarked for ResNets applied on the +Tobacco-3482 dataset. + +Student +– +ResNet-50 + +Method +Teacher +CE +CE+KD +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet + +ACC +0.445 +0.552 +0.667 +0.436 +0.399 +0.176 +0.314 +0.577 + +ECE +0.102 +0.096 +0.127 +0.076 +0.083 +0.250 +0.103 +0.085 + +AURC +0.360 +0.256 +0.149 +0.330 +0.379 +0.768 +0.429 +0.219 + +Table D.5. Results of different KD strategies benchmarked for ViT-B applied on the +Tobacco-3482 datasets. + +Student +ViT-S + +ViT-T + +Method +Teacher +CE +CE+KD +NKD +MSE +SimKD [CNN] +FitNet +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet + +ACC +0.876 +0.783 +0.814 +0.803 +0.807 +0.836 +0.821 +0.792 +0.798 +0.811 +0.810 +0.805 + +ECE +0.082 +0.096 +0.072 +0.094 +0.161 +0.125 +0.151 +0.064 +0.198 +0.599 +0.135 +0.160 + +AURC +0.040 +0.071 +0.063 +0.066 +0.062 +0.072 +0.059 +0.069 +0.074 +0.065 +0.081 +0.070 + + \ No newline at end of file diff --git a/assets/txts/pg_0281.txt b/assets/txts/pg_0281.txt new file mode 100644 index 0000000000000000000000000000000000000000..c041f39cf9c321030da2b3eb2103d317d2257ca6 --- /dev/null +++ b/assets/txts/pg_0281.txt @@ -0,0 +1,149 @@ +ADDITIONAL EXPERIMENT RESULTS + +249 + +Table D.6. Results of different KD strategies benchmarked for DiT-B applied on the +Tobacco-3482 dataset. + +Student +ViT-S + +ViT-T + +Method +Teacher +CE +CE+KD +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet +CE +CE+KD +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet + +ACC +0.916 +0.820 +0.825 +0.813 +0.818 +0.829 +0.810 +0.827 +0.810 +0.816 +0.807 +0.811 +0.778 +0.783 +0.793 + +ECE +0.109 +0.081 +0.086 +0.101 +0.090 +0.153 +0.144 +0.152 +0.066 +0.078 +0.087 +0.072 +0.162 +0.187 +0.168 + +AURC +0.020 +0.059 +0.064 +0.055 +0.063 +0.056 +0.062 +0.067 +0.065 +0.065 +0.063 +0.061 +0.093 +0.079 +0.077 + +Table D.7. Results for DLA-KD experiments on PRImA dataset. 
+
+Teacher      Student     Method     mAP
+ViT-B        –           Teacher    36.01
+ResNet-101   –           Teacher    38.34
+–            ViT-T       Baseline   32.64
+–            ResNet-50   Baseline   35.61
+ResNet-101   ResNet-50   SimKD      35.00
+ResNet-101   ResNet-50   ReviewKD   34.31
+ViT-B        ViT-T       SimKD      32.05
+ViT-B        ViT-T       ReviewKD   31.94
+
+D.1  Tobacco-3482 Results
+
+D.2  PRImA Results
+
+D.3  RVL-CDIP-N Results
+
+D.4  Downstream DocVQA Results
+
+D.5  Ablation Experiments
+
+The experiments with random student weight initialization (Tables D.12
+and D.13) show that ViTs suffer more from student weight initialization, which is
\ No newline at end of file
diff --git a/assets/txts/pg_0282.txt b/assets/txts/pg_0282.txt
new file mode 100644
index 0000000000000000000000000000000000000000..68df93f8252c502bffc8e41c9667118846f2dfd0
--- /dev/null
+++ b/assets/txts/pg_0282.txt
@@ -0,0 +1,61 @@
+250 APPENDIX - KDD
+
+evidenced by an average accuracy of 0.5962 for ViT-S/T_rand compared to 0.7675
+for R50_rand. When the student initialization does not depend on pretraining,
+NKD emerges as a performant method, showing the versatility of response-based
+methods when the transfer of feature representations is harder.
+
+Table D.8. Evaluation including relative runtime of KD methods on RVL-CDIP-N,
+where from left-to-right results are grouped per KD strategy, per backbone, per student
+size.
+
+Table D.9. Results for KD methods when averaged over architectures and student
+sizes on RVL-CDIP-N.
+
+KD method          ACC    ECE    AURC
+Teacher            0.611  0.120  0.152
+CE                 0.573  0.119  0.215
+CE+KD              0.519  0.184  0.298
+NKD                0.524  0.137  0.259
+MSE                0.490  0.205  0.308
+SimKD [CLS+MLP]    0.613  0.202  0.216
+SimKD [CNN]        0.629  0.273  0.197
+FitNet             0.534  0.281  0.246
+prompt +plain +space +task ++DLA + +task_space ++DLA + +DLA + +ANLS Image/Photo Yes/No Figure/diagram Form Free_text Handwritten Layout Others Table/list + +Resnet-101 +Resnet-101 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD +Resnet-101 +Resnet-50 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD + +4.3 +4.61 +57.63 +57.76 +57.55 +57.76 +57.53 +58.39 +58.65 +57.96 +58.58 +62.46 +61.86 +62.08 +62.14 +61.95 +61.2 +58.65 +61.58 +61.46 + +4.25 +2.97 +45.38 +43.31 +44.44 +43.31 +45.45 +44.43 +44.7 +45.9 +45.09 +42.95 +41.51 +39.62 +44.09 +43.93 +44.58 +44.7 +46.25 +44.79 + +5.36 +0.0 +51.52 +47.02 +49.4 +47.02 +51.52 +41.67 +50.3 +47.32 +49.43 +49.43 +48.24 +49.13 +42.26 +44.97 +49.13 +50.3 +46.75 +48.24 + +1.46 +1.25 +34.97 +35.01 +34.0 +35.01 +35.28 +34.81 +36.19 +33.49 +34.92 +40.93 +40.63 +42.4 +40.39 +40.57 +40.28 +36.19 +37.84 +40.25 + +2.69 +3.31 +67.88 +66.84 +66.99 +66.84 +67.39 +66.38 +67.65 +66.68 +67.28 +71.15 +71.12 +71.27 +70.6 +71.02 +68.95 +67.65 +69.37 +69.55 + +8.99 +7.55 +69.71 +70.03 +68.64 +70.03 +68.73 +67.82 +68.0 +68.92 +70.64 +70.59 +69.39 +70.37 +69.69 +70.12 +68.39 +68.0 +69.27 +69.95 + +1.74 +2.14 +53.19 +52.27 +51.97 +52.27 +52.23 +52.1 +52.49 +51.15 +52.19 +55.87 +54.56 +54.43 +53.07 +54.95 +52.81 +52.49 +53.86 +53.15 + +6.1 +6.48 +55.51 +57.16 +56.52 +57.16 +56.71 +59.19 +59.29 +58.46 +58.44 +61.87 +61.38 +61.54 +61.8 +61.43 +61.38 +59.29 +61.5 +61.0 + +7.72 +8.45 +55.78 +58.77 +58.23 +58.77 +56.5 +55.91 +57.03 +56.32 +57.68 +61.05 +58.62 +59.86 +60.14 +60.74 +56.44 +57.03 +58.44 +58.18 + +1.87 +2.59 +53.81 +52.22 +52.64 +52.22 +52.2 +52.79 +52.72 +51.89 +52.82 +58.31 +57.48 +57.59 +58.29 +57.69 +56.7 +52.72 +57.63 +57.05 + +Table D.11. Validation ANLS (scaled to %) of Llama-2-7b-chat [452] on +InfographicsVQA [310], with a KD-DLA model enriching the prompt. 
+prompt +plain +space +task ++DLA + +task+space ++DLA + +DLA + +Resnet-50 +Resnet-101 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD +Resnet-50 +Resnet-101 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD + +ANLS Arithmetic Comparison Counting Figure Map Multi-span Non-extractive Question span Single span Table/list Text Visual/layout +0.81 +0.69 +29.08 +27.94 +27.86 +28.16 +27.65 +28.36 +28.32 +28.23 +28.18 +27.97 +27.14 +28.08 +28.07 +27.68 +28.05 +27.0 +28.47 +27.97 + +0.0 +0.0 +14.15 + +0.0 +0.0 +26.94 + +0.23 +0.0 +11.35 + +0.42 0.0 +0.32 0.0 +27.52 19.1 + +0.93 +0.9 +19.79 + +0.12 +0.0 +12.79 + +0.64 +0.53 +48.44 + +0.98 +0.86 +33.79 + +1.0 +1.08 +26.17 + +1.93 +1.55 +35.24 + +0.47 +0.0 +26.39 + +14.1 +12.12 +13.33 +13.79 +14.93 +15.06 + +26.21 +24.96 +25.81 +25.78 +29.15 +28.02 + +10.28 +11.35 +12.05 +9.95 +7.64 +9.58 + +26.19 +26.32 +26.39 +26.16 +27.05 +27.25 + +20.25 +18.82 +22.11 +19.53 +19.0 +19.01 + +17.7 +18.32 +21.06 +18.78 +19.41 +17.0 + +12.28 +11.93 +12.93 +11.97 +11.21 +11.82 + +45.14 +44.81 +46.95 +45.95 +46.87 +45.67 + +32.7 +32.62 +32.42 +32.17 +33.35 +33.48 + +24.79 +24.51 +25.02 +24.31 +25.56 +25.02 + +34.3 +33.89 +34.18 +33.8 +34.59 +34.81 + +26.96 +25.94 +26.86 +26.31 +26.69 +28.33 + +13.35 +14.82 +9.78 +8.12 +9.49 +9.59 + +27.7 +26.31 +25.13 +23.78 +24.31 +24.18 + +10.78 +9.6 +6.99 +6.27 +8.04 +8.41 + +26.39 +26.19 +25.93 +24.68 +25.88 +25.88 + +20.03 +18.96 +21.04 +18.67 +19.72 +18.67 + +20.4 +18.09 +22.33 +19.26 +21.01 +21.37 + +11.92 +12.51 +8.2 +7.0 +8.63 +9.01 + +45.95 +45.36 +43.36 +41.95 +41.23 +42.86 + +32.95 +32.87 +33.53 +33.03 +33.77 +33.53 + +25.9 +24.93 +25.76 +25.93 +25.87 +26.2 + +35.28 +34.71 +35.06 +34.07 +35.24 +35.49 + +27.46 +30.98 +27.47 +28.48 +28.44 +27.8 + +9.98 +9.92 +9.06 +10.89 +10.56 + +24.45 +25.28 +23.19 +25.9 +25.54 + +7.11 +7.83 +7.34 +5.42 +8.35 + +25.71 +26.28 +25.81 +26.8 +26.23 + +20.65 +19.0 +21.9 +22.23 +20.65 + +20.87 +21.85 +18.9 +20.59 +20.34 + +8.4 +8.82 +8.04 +8.28 +9.19 + +43.36 +41.84 +39.82 +45.67 +44.08 + +33.19 +33.54 +32.65 +34.24 +33.43 + +25.51 +25.57 +23.69 +26.44 +25.04 + +34.56 +34.6 +33.93 +35.81 +33.89 + +27.81 +29.17 +28.33 +29.14 +30.49 + + \ No newline at end of file diff --git a/assets/txts/pg_0284.txt b/assets/txts/pg_0284.txt new file mode 100644 index 0000000000000000000000000000000000000000..63241834db57d93325e6d5d86d6c4a710c6a6be4 --- /dev/null +++ b/assets/txts/pg_0284.txt @@ -0,0 +1,148 @@ +252 + +APPENDIX - KDD + +Table D.12. Results of different KD strategies benchmarked for ViT-B teacher with +randomly initialized (rand) ViT students applied on the RVL-CDIP dataset. + +Teacher +ViT-B_rand +– +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B + +Student +– +ViT-Srand + +ViT-Trand + +Method +Baseline +Vanilla [τ = 2.5, α = 0.5] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet [middle] +Vanilla [τ = 2.5, α =] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet [middle] + +ACC +0.540 +0.613 +0.579 +0.626 +0.609 +0.681 +0.628 +0.560 +0.552 +0.579 +0.582 +0.663 +0.570 + +AURC +0.235 +0.175 +0.193 +0.159 +0.181 +0.181 +0.161 +0.212 +0.215 +0.198 +0.199 +0.205 +0.207 + +ECE +0.078 +0.220 +0.046 +0.203 +0.120 +0.297 +0.155 +0.141 +0.025 +0.232 +0.196 +0.316 +0.143 + +Table D.13. Results of different KD strategies benchmarked for ResNet-101 teacher +with randomly initialized (rand) ResNet-50 students applied on the RVL-CDIP +dataset. 
+ +Teacher +R101_rand +– +R101 +R101 +R101 +R101 +R101 +R101 + +Student +– +R50 +R50rand + +Method +Baseline +Baseline +Vanilla [τ = 2.5, α = 0.5] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [CLS+MLP] +SimKD [∅ projector] +FitNet [middle] + +ACC + +AURC + +ECE + +0.769 +0.760 +0.770 +0.765 +0.766 +0.774 +0.760 + +0.015 +0.017 +0.051 +0.022 +0.037 +0.025 +0.177 + +0.066 +0.071 +0.072 +0.068 +0.068 +0.063 +0.078 + + \ No newline at end of file diff --git a/assets/txts/pg_0285.txt b/assets/txts/pg_0285.txt new file mode 100644 index 0000000000000000000000000000000000000000..5767abffae86f6166b213e40c39efe1ae240c78d --- /dev/null +++ b/assets/txts/pg_0285.txt @@ -0,0 +1,17 @@ +Curriculum +JORDY VAN LANDEGHEM received an M.A. degree in Linguistics in 2015 and +an M.Sc. degree in artificial intelligence in 2017, both from KU Leuven, where +he is currently pursuing a Ph.D. degree in computer science. He completed +research internships at Oracle and Nuance Communications, and is currently +the lead AI Researcher at Contract.fit, a European SaaS start-up building +intelligent document processing solutions. +His industrial Ph.D. project entitled “Intelligent Automation for AI-Driven +Document Understanding” focuses on the fundamentals of probabilistic deep +learning, emphasizing calibration, uncertainty quantification, and out-ofdistribution robustness to obtain more reliable document intelligence systems. +Recently, he spearheaded the Document UnderstanDing of Everything (DUDE) +project and the ensuing ICDAR 2023 competition, with more research published +on reliable and scalable document understanding. + +253 + + \ No newline at end of file diff --git a/assets/txts/pg_0286.txt b/assets/txts/pg_0286.txt new file mode 100644 index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf --- /dev/null +++ b/assets/txts/pg_0286.txt @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/txts/pg_0287.txt b/assets/txts/pg_0287.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebe343634ebd0ff657f61f44c211da9ae2aadc6c --- /dev/null +++ b/assets/txts/pg_0287.txt @@ -0,0 +1,22 @@ +Publications +Journal Articles +Sumam Francis, Jordy Van Landeghem, and Marie-Francine Moens. Transfer +Learning for Named Entity Recognition in Financial and Biomedical Documents. +Information, 10(8):248, 2019 +Jordy Van Landeghem, Matthew Blaschko, Bertrand Anckaert, and MarieFrancine Moens. Benchmarking Scalable Predictive Uncertainty in Text +Classification. IEEE Access, 2022 + +Peer-reviewed International Conference and Workshop Articles +Jordy Van Landeghem, Matthew B Blaschko, Bertrand Anckaert, and MarieFrancine Moens. Predictive Uncertainty for Probabilistic Novelty Detection in +Text Classification. In ICML Workshop on Uncertainty and Robustness in Deep +Learning, 2020 +Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, +Dawid Jurkiewicz, Rafał Powalski, Paweł Józiak, Sanket Biswas, Mickaël +Coustaty, and Tomasz Stanisławek. ICDAR 2023 Competition on Document +UnderstanDing of Everything (DUDE). In International Conference on +Document Analysis and Recognition, pages 420–434. 
Springer, 2023 *Oral +Presentation + +255 + + \ No newline at end of file diff --git a/assets/txts/pg_0288.txt b/assets/txts/pg_0288.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce967af6839379e5024fe965aec9cb27744ca0db --- /dev/null +++ b/assets/txts/pg_0288.txt @@ -0,0 +1,25 @@ +256 + +PUBLICATIONS + +Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, +Pawel Joziak, Rafal Powalski, Dawid Jurkiewicz, Mickaël Coustaty, Bertrand +Anckaert, Ernest Valveny, Matthew Blaschko, Marie-Francine Moens, and +Tomasz Stanisławek. Document Understanding Dataset and Evaluation (DUDE). +In Proceedings of the IEEE/CVF International Conference on Computer Vision, +pages 19528–19540, 2023 +Jordy Van Landeghem, Sanket Biswas, Matthew Blaschko, and Marie-Francine +Moens. Beyond Document Page Classification: Design, Datasets, and Challenges. +In Proceedings of the IEEE/CVF Winter Conference on Applications of +Computer Vision, pages 2962–2972, 2024 *Oral Presentation +Jordy Van Landeghem, Subhajit Maity, Ayan Banerjee, Matthew B Blaschko, +Marie-Francine Moens, Josep Llados, and Sanket Biswas. DistilDoc: Knowledge +Distillation for Visually-Rich Document Applications. In Proceedings of the +IEEE/CVF Conference on Computer Vision and Pattern Recognition (under +review), 2024 + +Organized Competitions +ICDAR2023 Competition on Document UnderstanDing of Everything (DUDE), +ICDAR, February-May, 2023, https://rrc.cvc.uab.es/?ch=23, Main organizer. + + \ No newline at end of file diff --git a/assets/txts/pg_0289.txt b/assets/txts/pg_0289.txt new file mode 100644 index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf --- /dev/null +++ b/assets/txts/pg_0289.txt @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/txts/pg_0290.txt b/assets/txts/pg_0290.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9799ff2cee861bc09e10c34450b305239a8eb3c --- /dev/null +++ b/assets/txts/pg_0290.txt @@ -0,0 +1,9 @@ +FACULTY OF ENGINEERING SCIENCE +DEPARTMENT OF COMPUTER SCIENCE +LANGUAGE INTELLIGENCE & INFORMATION RETRIEVAL LAB +Celestijnenlaan 200A box 2402 +B-3001 Leuven +jordy.vanlandeghem@cs.kuleuven.be +https://liir.cs.kuleuven.be/ + + \ No newline at end of file