diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..fd3a6257b65ba0c3ce37c751d3703f362ab16304 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,294 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0031.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0081.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0123.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0155.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0216.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0277.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0015.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0047.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0051.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0054.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0088.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0250.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0009.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0089.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0117.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0241.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0101.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0110.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0208.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0226.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0284.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0060.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0252.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0058.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0099.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0195.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0057.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0105.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0125.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0169.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0184.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0196.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0075.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0236.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0276.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0006.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0156.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0082.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0106.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0157.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0188.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0201.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0225.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0248.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0023.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0116.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0119.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0254.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0278.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0045.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0093.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0182.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0064.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0094.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0104.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0113.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0150.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0189.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0220.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0261.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0011.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0048.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0288.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0034.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0108.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0214.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0287.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0100.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0198.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0227.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0244.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0245.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0270.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0039.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0055.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0086.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0174.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0181.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0266.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0283.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0073.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0080.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0274.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0279.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0036.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0050.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0069.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0053.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0056.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0145.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0027.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0067.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0079.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0013.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0072.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0191.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0263.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0268.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0041.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0136.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0170.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0180.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0200.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0217.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0280.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0016.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0018.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0062.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0122.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0147.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0265.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0215.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0133.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0165.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0166.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0222.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0078.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0171.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0219.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0028.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0107.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0144.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0178.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0190.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0043.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0010.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0021.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0160.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0247.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0063.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0090.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0137.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0159.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0269.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0014.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0026.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0033.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0035.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0046.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0186.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0237.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0179.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0193.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0232.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0109.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0134.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0286.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0003.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0004.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0206.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0251.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0040.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0083.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0230.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0272.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0275.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0096.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0115.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0260.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0271.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0012.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0022.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0176.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0218.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0273.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0065.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0132.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0187.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0267.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0044.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0029.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0084.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0087.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0238.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0253.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0257.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0102.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0103.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0148.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0242.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0258.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0005.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0008.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0032.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0037.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0070.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0207.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0235.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0061.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0068.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0077.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0204.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0239.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0255.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0289.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0025.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0052.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0066.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0131.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0163.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0259.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0224.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0249.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0121.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0140.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0143.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0151.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0095.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0111.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0139.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0211.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0019.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0076.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0152.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0212.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0223.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0017.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0142.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0158.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0233.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0256.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0262.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0282.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0020.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0024.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0199.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0264.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0002.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0092.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0120.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0071.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0074.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0203.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0285.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0085.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0127.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0185.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0281.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0098.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0112.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0141.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0146.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0164.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0240.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0246.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0097.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0149.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0162.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0030.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0049.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0177.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0209.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0213.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0059.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0091.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0129.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0172.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0175.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0183.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0194.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0231.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0001.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0130.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0168.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0202.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0210.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0234.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0038.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0042.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0114.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0124.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0138.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0153.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0154.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0161.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0173.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0221.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0229.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0118.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0126.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0135.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0167.pdf filter=lfs diff=lfs merge=lfs -text +assets/pdfs/pg_0192.pdf filter=lfs diff=lfs merge=lfs -text 
+assets/pdfs/pg_0290.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0007.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0128.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0197.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0243.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0205.pdf filter=lfs diff=lfs merge=lfs -text
+assets/pdfs/pg_0228.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/OCR_directory.sh b/OCR_directory.sh
new file mode 100644
index 0000000000000000000000000000000000000000..48fa92b71c41dfc9e0269f303709dd0bfc17fda6
--- /dev/null
+++ b/OCR_directory.sh
@@ -0,0 +1,17 @@
+# pdftk thesis.pdf burst
+
+# Using pdftotext, extract the text of each page in assets/pdfs and store it
+# in assets/txts under a matching basename.
+for pdf in assets/pdfs/*.pdf
+do
+    pdftotext "$pdf" assets/txts/$(basename "$pdf" .pdf).txt
+    # Alternative extractor (pdfminer): pdf2txt.py -o assets/txts/$(basename "$pdf" .pdf).txt "$pdf"
+done
+
+# Rasterize each page to a PNG (e.g., for displaying answer pages in app.py).
+for pdf in assets/pdfs/*.pdf
+do
+    convert -density 100 -quality 100 -colorspace RGB -alpha remove -alpha off "$pdf" assets/pngs/$(basename "$pdf" .pdf).png
+done
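+
+# Hypothetical one-time setup (an assumption, not part of the original
+# script): create the output directories and burst the source manuscript
+# into the per-page PDFs the loops above expect. thesis.pdf is the name
+# used in the comment at the top of this file; pg_%04d.pdf is pdftk's
+# default burst pattern and matches the files tracked in .gitattributes.
+mkdir -p assets/pdfs assets/txts assets/pngs
+pdftk thesis.pdf burst output assets/pdfs/pg_%04d.pdf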
diff --git a/app.py b/app.py
index cbfee3ba4892f516b6e52d77f1ad51e7ccf5fe86..c73256166c40d3e087d7583ffc36ee4cf7370b8f 100644
--- a/app.py
+++ b/app.py
@@ -1,67 +1,114 @@
-import streamlit as st
-from llama_index import VectorStoreIndex
-from llama_index import ServiceContext
-from llama_index.embeddings import HuggingFaceEmbedding
-from llama_index.llms import HuggingFaceInferenceAPI
-from llama_index.schema import Document
-from PyPDF2 import PdfReader
-
-# Streamlit title and description
-st.title("PDF querying using Llama-Index by Rahul Bhoyar")
-st.write("Base Model: **HuggingFaceH4/zephyr-7b-alpha (open-source from HuggingFace)**")
-st.write("Embedding Model: **WhereIsAI/UAE-Large-V1 (open-source from HuggingFace)**")
-st.write("This app allows you to upload your own PDF and query your document.")
-
-hf_token = st.text_input("Enter your Hugging Face token:")
-
-
-def read_pdf(uploaded_file):
-    pdf_reader = PdfReader(uploaded_file)
-    text = ""
-    for page_num in range(len(pdf_reader.pages)):
-        text += pdf_reader.pages[page_num].extract_text()
-    return text
-
-
-# Streamlit input for user file upload
-success = False
-query_engine_creation = False
-uploaded_pdf = st.file_uploader("Upload your PDF", type=['pdf'])
-
-# Load data and configure the index
-if uploaded_pdf is not None:
-    file_contents = read_pdf(uploaded_pdf)
-    documents = Document(text=file_contents)
-    documents = [documents]
-    st.success("Documents loaded successfully!")
-
-    model = st.selectbox('Select the model', ('google/flan-t5-xxl','HuggingFaceH4/zephyr-7b-alpha'), index=0)
-    llm = HuggingFaceInferenceAPI(model_name=model, token=hf_token)
-
-    with st.spinner('Creating Vector Embeddings...'):
-        embed_model_uae = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")
-        service_context = ServiceContext.from_defaults(
-            llm=llm, chunk_size=800, chunk_overlap=20, embed_model=embed_model_uae
-        )
-        index = VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=True)
-        index.storage_context.persist()
-        query_engine = index.as_query_engine()
-        query_engine_creation = True
-        # Display the result of the task
-        st.success("Vector embeddings created.")
-        success = True
-else:
-    st.write("Please upload a file first.")
-
-if query_engine_creation:
-
-    # Streamlit input for user query
-    if success:
-        user_query = st.text_input("Enter your query:")
-
-        # Query engine with user input
-        if user_query:
-            with st.spinner('Fetching the response...'):
-                response = query_engine.query(user_query)
-
-            st.markdown(f"**Response:** {response}")
+import torch
+from transformers import BitsAndBytesConfig
+from llama_index.llms.huggingface import HuggingFaceLLM
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core import VectorStoreIndex, SummaryIndex
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core import Settings
+
+import gradio as gr
+
+
+def messages_to_prompt(messages):
+    prompt = ""
+    for message in messages:
+        if message.role == "system":
+            # Override any incoming system message with a fixed domain prompt.
+            m = "You are an expert in the research field of document understanding, bayesian deep learning and neural networks."
+            prompt += f"<|system|>\n{m}\n"
+        elif message.role == "user":
+            prompt += f"<|user|>\n{message.content}\n"
+        elif message.role == "assistant":
+            prompt += f"<|assistant|>\n{message.content}\n"
+
+    # Ensure we start with a system prompt; insert a blank one if needed.
+    if not prompt.startswith("<|system|>\n"):
+        prompt = "<|system|>\n\n" + prompt
+
+    # Add the final assistant prompt.
+    prompt = prompt + "<|assistant|>\n"
+
+    return prompt
+
+
+def load_RAG_pipeline():
+    # LLM: 4-bit quantized Zephyr-7B served through llama-index.
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    )
+
+    llm = HuggingFaceLLM(
+        model_name="HuggingFaceH4/zephyr-7b-alpha",
+        tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
+        query_wrapper_prompt=PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"),
+        context_window=3900,
+        max_new_tokens=256,
+        model_kwargs={"quantization_config": quantization_config},
+        # tokenizer_kwargs={},
+        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
+        messages_to_prompt=messages_to_prompt,
+        device_map="auto",
+    )
+
+    # Llama-index global settings.
+    Settings.llm = llm
+    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+    # Settings.chunk_size = 512
+    # Settings.chunk_overlap = 50
+
+    # Raw data: the per-page text files extracted by OCR_directory.sh.
+    documents = SimpleDirectoryReader("assets/txts").load_data()
+    vector_index = VectorStoreIndex.from_documents(documents)
+    # summary_index = SummaryIndex.from_documents(documents)
+    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=3)
+    return query_engine
+
+
+query_engine = load_RAG_pipeline()
+
+
+# These are placeholder functions to simulate the behavior of the RAG setup;
+# replace them with the actual logic to retrieve and generate answers from the document.
+def get_answer(question, temperature, nucleus_sampling, max_tokens):
+    # NOTE: temperature, nucleus_sampling, and max_tokens are currently ignored;
+    # one way to wire them through is sketched below.
+    # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
+    return query_engine.query(question)
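+
+# Minimal sketch (added for illustration; not part of the original app): one
+# way to honor the UI sampling controls on a per-query basis, assuming the
+# HuggingFaceLLM instance keeps its generate_kwargs dict and max_new_tokens
+# field mutable at runtime. E.g., call it at the top of get_answer before
+# querying.
+def apply_generation_settings(temperature, nucleus_sampling, max_tokens):
+    llm = Settings.llm
+    llm.generate_kwargs["temperature"] = temperature
+    llm.generate_kwargs["top_p"] = nucleus_sampling
+    llm.max_new_tokens = max_tokens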
+
+
+def get_answer_page(question):
+    # Implement logic to retrieve the page number or an image of the page with
+    # the answer, e.g., a path to the matching rendered page under assets/pngs.
+    answer_page = "Page X - placeholder image."
+    return answer_page
+
+
+# Combine both outputs for the gr.Interface function.
+def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
+    answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
+    answer_page = get_answer_page(question)
+    return answer, answer_page
+
+
+# Set up the interface options based on the design in the image.
+iface = gr.Interface(
+    fn=ask_my_thesis,
+    inputs=[
+        gr.Textbox(label="Question", placeholder="Type your question here..."),
+        gr.Slider(0, 1, value=0.7, label="Temperature"),
+        gr.Slider(0, 1, value=0.9, label="Nucleus Sampling"),
+        gr.Slider(1, 500, value=100, label="Max Generated Number of Tokens"),
+    ],
+    outputs=[gr.Textbox(label="Answer"), gr.Image(label="Answer Page")],
+    title="Ask my thesis",
+    description="Chat with the manuscript: ask questions and receive answers with references.",
+    allow_flagging="never",
+)
+
+# Start the application.
+if __name__ == "__main__":
+    iface.launch()
diff --git a/assets/txts/pg_0002.txt b/assets/txts/pg_0002.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf
--- /dev/null
+++ b/assets/txts/pg_0002.txt
@@ -0,0 +1 @@
+ 
\ No newline at end of file
diff --git a/assets/txts/pg_0003.txt b/assets/txts/pg_0003.txt
new file mode 100644
index 0000000000000000000000000000000000000000..83e60fa37e6fcb448f5b9ea80c6f69fe266013cf
--- /dev/null
+++ b/assets/txts/pg_0003.txt
@@ -0,0 +1,25 @@
+Intelligent Automation for AI-Driven Document Understanding
+
+Jordy VAN LANDEGHEM
+
+Examination committee:
+em. Prof. Dr. ir. Jean-Pierre Celis, chair
+Prof. Dr. Marie-Francine Moens, supervisor
+Prof. Dr. Matthew B. Blaschko, supervisor
+Prof. Dr. ir. Johan Suykens
+Prof. Dr. ir. Tinne Tuytelaars
+Prof. Dr. Marcus Rohrbach (TU Darmstadt)
+Prof. Dr. Wenpeng Yin (Penn State University)
+Dr. Bertrand Anckaert (Contract.fit)
+March 2024
+
+Dissertation presented in partial fulfillment of the requirements for the degree of Doctor of Engineering Science (PhD): Computer Science
\ No newline at end of file
diff --git a/assets/txts/pg_0004.txt b/assets/txts/pg_0004.txt
new file mode 100644
index 0000000000000000000000000000000000000000..74e9321bf34d335608e0753358c62f97ea7559ac
--- /dev/null
+++ b/assets/txts/pg_0004.txt
@@ -0,0 +1,10 @@
+© 2024 KU Leuven – Faculty of Engineering Science
+Uitgegeven in eigen beheer, Jordy Van Landeghem, Celestijnenlaan 200A box 2402, B-3001 Leuven (Belgium)
+
+Alle rechten voorbehouden. Niets uit deze uitgave mag worden vermenigvuldigd en/of openbaar gemaakt worden door middel van druk, fotokopie, microfilm, elektronisch of op welke andere wijze ook zonder voorafgaande schriftelijke toestemming van de uitgever.
+All rights reserved. No part of the publication may be reproduced in any form by print, photoprint, microfilm, electronic or any other means without written permission from the publisher.
\ No newline at end of file
diff --git a/assets/txts/pg_0005.txt b/assets/txts/pg_0005.txt
new file mode 100644
index 0000000000000000000000000000000000000000..92cd5a79125a60f2b827b17c151c47cf1b15932d
--- /dev/null
+++ b/assets/txts/pg_0005.txt
@@ -0,0 +1,32 @@
+Preface
+This journey has been long and arduous, but I have finally reached an end. At this end, I have a thesis that I am proud of, and I have learned a lot. As I look back, I have been very fortunate to have had the support of many people, and I would like to take this opportunity to thank them.
+First and foremost, I would like to thank my supervisors, Sien and Matthew, for their guidance and support throughout this journey. Sien has taught me the importance of being thorough and meticulous, striving for diligence and perfection from the get-go. I still remember how patiently she helped me with my first paper, holding a Sunday afternoon call from her attic/home-office, helping me hone the presentation and writing. Involving Matthew as the co-supervisor has been the best decision for my personal development, as he offered a different perspective on my work, always challenging me to look at problems through the lens of statistical theory and machine learning fundamentals. My knee-jerk reaction to start implementing things as soon as possible was often met with a “slow down, think about it first” from Matthew, which has been invaluable in my development as a researcher. I am grateful to both of them for their patience and understanding, and for giving me the freedom to explore my own ideas and interests.
+Next, a sincere thanks to my jury members, for taking the time to read my thesis and for their valuable feedback. Furthermore, I would like to thank het Vlaams Agentschap Innoveren & Ondernemen (VLAIO) for awarding the Baekeland grant without which this PhD would not have been possible.
+Pol & Bertrand, thanks for having me contribute to your dream to rid the world of boring administrative processes and paperwork. Technically my bosses, but in reality you are the embodiment of leadership by example, and I am grateful for the many lessons I have learned from you. I am grateful for the many opportunities you have given me to grow as a researcher and as a person.
+Many thanks to my past and present colleagues at Contract.fit, for always
\ No newline at end of file
diff --git a/assets/txts/pg_0006.txt b/assets/txts/pg_0006.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d3d5790043833b5e7d169c207f84dfd98dc4de22
--- /dev/null
+++ b/assets/txts/pg_0006.txt
@@ -0,0 +1,45 @@
+preaching automation, inspiring me, and for having fun along the way. I am grateful to my LIIR colleagues at KU Leuven, particularly the folks from office 4.34 for the many interesting discussions and whiteboard sessions, whenever I occasionally popped into the office.
+I was fortunate to travel to many places during my PhD (Lausanne, Lisbon, Barcelona, San Jose, Paris, Waikoloa), and I have met many people along the way. My DUDEs, you have been the trigger to complete my PhD, reinvigorating my passion for research and inspiring me for my future career. How crazy is it that we conceived the seeds of the DUDE project in a pirate bar, on a hotel rooftop, and from a hospital bed after my back surgery?
+Finally, I would like to thank my family and friends for their support and encouragement throughout this journey. My parents, Peter en Nadine, you have shown me that hard work pays off, and merci for the many sacrifices you have made to give me the best possible education and life. Marijke, you are the love of my life, and although I am not religious, you are my goddess, de mammiej. Feliz, when you came into our lives, you added an extra dimension. I used to see in 2D, now I see in 3D. Forever your father, your pappiej. Wes en Jen, thanks for showing me to never give up, keep on pushing, even when you are at your lowest, there is a way out, and only hard work will get you there.
+Cornbois - Bryan, Emile, (even) Jan, for our friendship, I fail to make an exhaustive definition. I wish for many more years of friendship from my like-minded brothers. John, Teunen, Wannes, if there is ever a zombie apocalypse, I know that I can count on you to have my window. Kessel-city - Poohke, Vinny, Kweinch etc., thanks for keeping on pushing the bar higher, and inspiring me with your ambition and drive. Gustaf, thanks for the many laughs (#velleke) and the much-needed distraction. Elstipoes, you are my oldest friend, and I am grateful for the many years of friendship. Woutje, thanks for your contagious optimism and the mancave during university. Leuvenbende, you were the ones that made university fun and enjoyable. Individually and together you are beautiful people, and I cherish our yearly reunions. Lauren en Yannick, thanks for letting me win at Mario Kart. I might be forgetting some people, but I would like to thank all my friends for bringing joy, for keeping me grounded, and for reminding me that there is more to life than work.
+Having studied literature in my Bachelor’s, it feels appropriate to finish with a quote wrongly attributed to Ernest Hemingway: “Write drunk; edit sober.”
+Jordy Van Landeghem
+Gurdo, Pogomeister, Jorre, De Van Laaandeghem
+February, 2024
+Kessel, Belgium
\ No newline at end of file
diff --git a/assets/txts/pg_0007.txt b/assets/txts/pg_0007.txt
new file mode 100644
index 0000000000000000000000000000000000000000..270758d7b3a3540df394e67d4f430799543303da
--- /dev/null
+++ b/assets/txts/pg_0007.txt
@@ -0,0 +1,33 @@
+Abstract
+Human communication is increasingly document-based, requiring machines to understand a wide variety of visually-rich documents to assist humans in their daily lives. Amid the digital evolution, documents continue to facilitate crucial human and organizational interactions but are tethered to manual processing, causing inefficiency. We examine why organizations lag in adopting automated document processing solutions and outline two primary challenges: the complexity of processing long, multimodal documents algorithmically and the necessity for reliability and control over associated risks. Automated decision-making is key to improving the efficiency of document processing, but the current state-of-the-art technology is not yet reliable and robust enough to be deployed in autonomous systems.
+The practical objective set here is to develop Intelligent Automation (IA) systems capable of estimating confidence in their actions, thereby increasing throughput without accruing additional costs due to errors. We analyze the key challenges and propose solutions to bridge the gap between research and practical applications, with a focus on realistic datasets and experimental methodologies. Building upon foundations of Document Understanding (DU), this dissertation introduces advanced methodologies combining Machine Learning, Natural Language Processing, and Computer Vision.
+Addressing the evident gaps in research, this work presents novel methods for predictive uncertainty quantification (PUQ) alongside practical frameworks for evaluating the robustness and reliability of DU technologies. The contribution culminates in the introduction of two novel multipage document classification datasets and a multifaceted benchmark, DUDE, designed to rigorously challenge and assess the state-of-the-art in DU.
+Extensive experiments across these datasets reveal that while advancements have been made, significant room for improvement remains, particularly in long-context modeling for multipage document processing and calibrated, selective document visual question answering. Efficient DU is also explored, revealing the effectiveness of
\ No newline at end of file
diff --git a/assets/txts/pg_0008.txt b/assets/txts/pg_0008.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8fcb7c74fe32fe733c89c3782a15c5c00fabb6c3
--- /dev/null
+++ b/assets/txts/pg_0008.txt
@@ -0,0 +1,35 @@
+knowledge distillation (KD) model compression in visually-rich document layout analysis (DLA) and classification.
+Through empirical studies and methodological contributions, this dissertation has the following contributions and findings:
+First, in a benchmarking study of established PUQ methods on real-world text classification, we find that our novel hybrid method ‘Concrete Dropout Ensemble’ performs best, enhancing in-domain calibration and novel class detection, even at a smaller ensemble size. Detailed ablation experiments reveal the impact of prior, neural architecture, and hyperparameter choices on PUQ estimation quality.
+Second, on a prototypical DU task, we identify challenges in DU progress, propose a formalization of multipage document classification scenarios, construct novel datasets, and conduct an experimental analysis showing the promise of multipage representation learning and inference.
+Third, we introduce DUDE, incorporating multifaceted challenges and principles for a comprehensive evaluation of generic DU. In addition to our own benchmarking, we organize a competition, revealing that while newer document foundation models show promise, they struggle with questions involving visual evidence or complex reasoning. Moreover, we find severe problems in the ability of Large Language Models (LLMs) to reason about documents in their entirety, highlighting issues with hallucination, long-context reasoning, and control.
+Fourth, we propose the first methodology for enriching documents with semantic layout structure using distilled DLA models. We apply KD to visual document tasks, unraveling the influence of various task and architecture components.
+Finally, the dissertation concludes with a discussion of the findings and implications for future research, emphasizing the need for advancements in multipage document representation learning and the importance of realistic datasets and experimental methodologies to move measurably toward reliable and robust IA-DU technology.
\ No newline at end of file
diff --git a/assets/txts/pg_0009.txt b/assets/txts/pg_0009.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e510cdecf65e2ade2cee9d25ddae51979c3cad8f
--- /dev/null
+++ b/assets/txts/pg_0009.txt
@@ -0,0 +1,34 @@
+Beknopte samenvatting
+Menselijke communicatie is in toenemende mate documentgebaseerd, waarbij machines een breed aanbod aan visueel-rijke documenten moeten begrijpen om mensen in hun dagelijks leven te assisteren. Te midden van de digitale evolutie blijven documenten cruciale menselijke en organisatorische interacties faciliteren, maar zijn ze gebonden aan handmatige verwerking, wat inefficiëntie veroorzaakt.
+We onderzoeken waarom organisaties achterblijven bij het adopteren van geautomatiseerde documentverwerkingsoplossingen en schetsen twee primaire uitdagingen: de complexiteit van het algoritmisch verwerken van lange, multimodale documenten en de noodzaak van betrouwbaarheid en controle over daarmee samenhangende risico’s. Geautomatiseerde besluitvorming is essentieel voor het verbeteren van de efficiëntie van documentverwerking, maar de huidige stand van de technologie is nog niet betrouwbaar en robuust genoeg om ingezet te worden in autonome toepassingen.
+Het praktische doel dat gesteld wordt, is het ontwikkelen van systemen voor Intelligente Automatisering (IA) die in staat zijn om vertrouwen in hun acties te schatten, daarmee de doorvoer verhogend zonder extra kosten vanwege fouten. We analyseren de belangrijkste uitdagingen en stellen oplossingen voor om de kloof tussen onderzoek en praktische toepassingen te overbruggen, met een focus op realistische datasets en experimentele methodologieën. Voortbouwend op de fundamenten van Documentinterpretatie (DI), introduceert dit proefschrift geavanceerde methodologieën die Machinaal Leren, Natuurlijke Taalverwerking en Computer Visie combineren.
+Door de duidelijke hiaten in onderzoek aan te pakken, presenteert dit werk nieuwe methoden voor predictieve onzekerheidskwantificering (POK) naast praktische kaders voor het evalueren van de robuustheid en betrouwbaarheid van DI-technologieën. De bijdrage culmineert in de introductie van twee nieuwe datasets voor classificatie van multipagina documenten en een veelzijdige benchmark, DUDE, ontworpen om de state-of-the-art in DI rigoureus uit te dagen en te beoordelen. Uitgebreide experimenten met deze datasets
\ No newline at end of file
diff --git a/assets/txts/pg_0010.txt b/assets/txts/pg_0010.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4550822d7849f128d14da308982dbac3ba22a0b0
--- /dev/null
+++ b/assets/txts/pg_0010.txt
@@ -0,0 +1,44 @@
+onthullen dat er weliswaar vooruitgang is geboekt, maar dat er nog significant veel ruimte is voor verbetering, met name in de lange-contextmodellering voor de verwerking van multipagina documenten en gekalibreerde, selectieve visuele vraagbeantwoording van documenten. Meer schaalbaar DI wordt ook verkend, waarbij de effectiviteit van kennisdistillatie (KD) voor modelcompressie in visueel-rijke layoutanalyse (DLA) en classificatie van documenten aan het licht komt.
+Door middel van empirische studies en methodologische bijdragen heeft dit proefschrift de volgende bijdragen en bevindingen:
+Ten eerste vinden we in een benchmarkstudie van gevestigde POK-methoden op tekstclassificatie in de echte wereld dat onze nieuwe hybride POK-methode ’Concrete Dropout Ensemble’ het beste presteert, de kalibratie binnen het domein en de detectie van nieuwe klassen verbeterend, zelfs met een kleiner ensemble. Gedetailleerde ablatie-experimenten onthullen de impact van voorafgaande kennis, neurale architectuur en keuzes van hyperparameters op de kwaliteit van POK-schatting.
+Ten tweede identificeren we uitdagingen in de vooruitgang van DI en stellen een formalisatie voor van multipagina documentclassificatiescenario’s, bouwen nieuwe datasets, en voeren een experimentele analyse uit die de belofte van multipagina representatie-leren en inferentie toont.
+Ten derde introduceren we DUDE, waarin veelzijdige uitdagingen en principes worden voorgesteld voor een uitgebreide evaluatie.
+Naast onze eigen benchmarking organiseren we een competitie, waaruit blijkt dat hoewel nieuwere modellen veelbelovend zijn, ze het moeilijk hebben met vragen die visueel bewijs of complex redeneren vereisen. Bovendien vinden we ernstige problemen in het vermogen van Grote Taalmodellen (LLMs) om over documenten in hun geheel te redeneren, wat problemen benadrukt met hallucinatie, redeneren met lange context en controle.
+Ten vierde stellen we de eerste experimentele methodologie voor om documenten te verrijken met semantische layoutstructuur met behulp van gedestilleerde DLA-modellen. We passen KD toe op visuele documenttaken, waarbij we de invloed van verschillende taak- en architectuurcomponenten ontrafelen.
+Ten slotte sluit het proefschrift af met een bespreking van de bevindingen en implicaties voor toekomstig onderzoek, waarbij de noodzaak wordt benadrukt voor vooruitgang in multipagina documentrepresentatie-leren en het belang van realistische datasets en experimentele methodologieën om meetbaar vooruitgang te boeken naar betrouwbare en robuuste IA-DI technologie.
\ No newline at end of file
diff --git a/assets/txts/pg_0013.txt b/assets/txts/pg_0013.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2589af835d77d115f558fc3b7c2dafd6fd1abf18
--- /dev/null
+++ b/assets/txts/pg_0013.txt
@@ -0,0 +1,22 @@
+List of Abbreviations
+AAPD Arxiv Academic Paper Dataset
+Acc_ID Accuracy in-domain
+Acc_OOD Accuracy out-of-domain
+AI Artificial Intelligence
+ANLS Average Normalized Levenshtein Similarity
+AUPR Area Under the Precision-Recall Curve
+AURC Area-Under-Risk-Coverage-Curve
+AUROC Area Under the Receiver Operating Characteristic curve
+BDL Bayesian Deep Learning
+BNN Bayesian Neural Network
+BPM Business Process Management
+CE Cross-Entropy
+CER Character Error Rate
+COCO Common Objects in Context
+CSF Confidence Scoring Function
+CV Computer Vision
+DC Document Classification
+DG Document Generation
\ No newline at end of file
diff --git a/assets/txts/pg_0014.txt b/assets/txts/pg_0014.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b79bccd5dd81bba4301749b161306cdf80403810
--- /dev/null
+++ b/assets/txts/pg_0014.txt
@@ -0,0 +1,30 @@
+DL Deep Learning
+DLA Document Layout Analysis
+DNN Deep Neural Network
+DocAI Document AI
+DocVQA Document Visual Question Answering
+DOD Document Object Detection
+DU Document Understanding
+DUDE Document UnderstanDing of Everything
+ECE Expected Calibration Error
+ELBO Evidence Lower Bound
+ERM Empirical Risk Minimization
+FasterRCNN Faster Region-based Convolutional Neural Network
+FP False Positives
+IA Intelligent Automation
+ICDAR International Conference on Document Analysis and Recognition
+IDP Intelligent Document Processing
+i.i.d. Independent and Identically Distributed
+IOB/IOBES Inside, Outside, Beginning / End, Single
+KD Knowledge Distillation
+KIE Key Information Extraction
+LLM Large Language Model
+MAP Maximum-a-Posteriori
+mAP Mean Average Precision
+MCD Monte Carlo Dropout
\ No newline at end of file
diff --git a/assets/txts/pg_0015.txt b/assets/txts/pg_0015.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2c6f921d4e7c76d2c47d65bafbf05adc7e167ec3
--- /dev/null
+++ b/assets/txts/pg_0015.txt
@@ -0,0 +1,30 @@
+MCMC Markov Chain Monte-Carlo
+MDLT Multi-Domain Long-Tailed Recognition
+MECE Mutually Exclusive and Collectively Exhaustive
+MI Mutual Information
+ML Machine Learning
+MSE Mean Squared Error
+MSP Maximum Softmax Probability
+MU Model Uncertainty
+NLG Natural Language Generation
+NLL Negative Log Likelihood
+NLP Natural Language Processing
+NN Neural Network
+OCR Optical Character Recognition
+OOD Out-of-Distribution
+PCC Pearson Correlation Coefficient
+PUQ Predictive Uncertainty Quantification
+RERM Regularized Empirical Risk Minimization
+ResNet Residual Network
+RPA Robotic Process Automation
+SaaS Software-as-a-service
+SNGP Spectral-normalized Neural Gaussian Process
+SOTA State-of-the-art
+STP Straight-Through-Processing
+TSR Table Structure Recognition
\ No newline at end of file
diff --git a/assets/txts/pg_0016.txt b/assets/txts/pg_0016.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ed6aa887a225e20ba41d199e062db8224ea88510
--- /dev/null
+++ b/assets/txts/pg_0016.txt
@@ -0,0 +1,12 @@
+VDU Visual Document Understanding
+VI Variational Inference
+VLM Vision Language Model
+VQA Visual Question Answering
+VRD Visually-Rich Document
+WER Word Error Rate
\ No newline at end of file
diff --git a/assets/txts/pg_0017.txt b/assets/txts/pg_0017.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdaa9c3763438226f6545313ed513efd4e5b5fb9
--- /dev/null
+++ b/assets/txts/pg_0017.txt
@@ -0,0 +1,200 @@
+Contents
+
+Abstract  iii
+Beknopte samenvatting  v
+List of Abbreviations  xii
+Contents  xiii
+List of Figures  xix
+List of Tables  xxv
+
+1 Introduction  1
+  1.1 Research Context  4
+  1.2 Problem Statement and Questions  6
+    1.2.1 Reliable and Robust Deep Learning  6
+    1.2.2 Realistic and Efficient Document Understanding  7
+  1.3 Outline  9
+
+2 Fundamentals  11
+  2.1 Statistical Learning  12
+    2.1.1 Neural Networks  14
+    2.1.2 Probabilistic Evaluation  15
+    2.1.3 Architectures  16
+      2.1.3.1 Convolutional Neural Networks  17
+      2.1.3.2 Language Neural Networks  18
+      2.1.3.3 Transformer Network  19
+  2.2 Reliability and Robustness  21
+    2.2.1 Generalization and Adaptation  22
+    2.2.2 Confidence Estimation  23
+    2.2.3 Evaluation Metrics  24
\ No newline at end of file
diff --git a/assets/txts/pg_0018.txt b/assets/txts/pg_0018.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1b5587f64c55836f280f605559f2fbc81e8fff8d
--- /dev/null
+++ b/assets/txts/pg_0018.txt
@@ -0,0 +1,204 @@
+    2.2.4 Calibration  28
+    2.2.5 Predictive Uncertainty Quantification  30
+    2.2.6 Failure Prediction  32
+  2.3 Document Understanding  33
+    2.3.1 Task Definitions  35
+    2.3.2 Datasets  36
+    2.3.3 Models  37
+    2.3.4 Challenges in Document Understanding  38
+      2.3.4.1 Long-Context Modeling  39
+      2.3.4.2 Document Structure Modeling  40
+  2.4 Intelligent Automation  41
+
+I Reliable and Robust Deep Learning  43
+
+3 Benchmarking Scalable Predictive Uncertainty in Text Classification  44
+  3.1 Introduction  46
+  3.2 Related Work  48
+  3.3 Uncertainty Methods  51
+    3.3.1 Quantifying Uncertainty in Deep Learning  51
+    3.3.2 Predictive Uncertainty Methods  52
+      3.3.2.1 Monte Carlo Dropout  53
+      3.3.2.2 Deep Ensemble  53
+      3.3.2.3 Concrete Dropout  54
+      3.3.2.4 Heteroscedastic Extensions  54
+    3.3.3 Uncertainty Estimation  55
+    3.3.4 Motivating Hybrid Approaches  58
+    3.3.5 Uncertainty Calibration under Distribution Shift  59
+  3.4 Experimental Methodology  61
+    3.4.1 Proposed Hybrid Approaches  61
+    3.4.2 Datasets  63
+    3.4.3 Architecture  64
+    3.4.4 Evaluation metrics  66
+    3.4.5 Experimental design  66
+      3.4.5.1 In-domain Setting  67
+      3.4.5.2 Cross-domain Setting  67
+      3.4.5.3 Novelty Detection Setting  68
+  3.5 Results  69
+    3.5.1 Experiment: In-domain  70
+    3.5.2 Experiment: Cross-domain  71
+    3.5.3 Experiment: Novelty Detection  73
+    3.5.4 Experiment: Ablations  75
+      3.5.4.1 Diversity  76
\ No newline at end of file
diff --git a/assets/txts/pg_0019.txt b/assets/txts/pg_0019.txt
new file mode 100644
index 0000000000000000000000000000000000000000..724ad053415c4d63da70b2d88aa4cd7f2cc9877e
--- /dev/null
+++ b/assets/txts/pg_0019.txt
@@ -0,0 +1,394 @@
+      3.5.4.2 NLP Architecture  77
+      3.5.4.3 Ensemble size M  79
+      3.5.4.4 Concrete Dropout p  80
+  3.6 Discussion  81
+  3.7 Additional Uncertainty Approaches  85
+    3.7.1 Stochastic Gradient MCMC Methods  86
+    3.7.2 Spectral-normalized Neural Gaussian Process  87
+      3.7.2.1 SNGP Results  88
+      3.7.2.2 SNGP Discussion  90
+  3.8 Limitations  90
+  3.9 Chapter Conclusion  91
+
+II Realistic and Efficient Document Understanding  94
+
+4 Beyond Document Page Classification: Design, Datasets, and Challenges  95
+  4.1 Introduction  97
+  4.2 Problem Formulation  98
+  4.3 Balancing Research & Applications  101
+  4.4 Experimental Study  104
+  4.5 Challenges and Guidelines  107
+    4.5.1 Divergence of Tasks: f  107
+    4.5.2 Divergence of Label Space: Y  108
+    4.5.3 Divergence of Input Data: X  109
+    4.5.4 Maturity of Evaluation Methodology  111
+  4.6 Chapter Conclusion  111
+
+5 Document UnderstanDing of Everything (DUDE)  113
+  5.1 Introduction  116
+  5.2 Related Work  117
+  5.3 DUDE Dataset  118
+    5.3.1 Gathering Documents  121
+    5.3.2 Annotation Process  121
+    5.3.3 Dataset Statistics  123
+    5.3.4 Diagnostic Subsets  125
+    5.3.5 Evaluation  126
+  5.4 DUDE Competition  128
+    5.4.1 Challenge Objectives  128
+    5.4.2 Challenge Contributions  129
+    5.4.3 Motivation and Scope  129
+      5.4.3.1 Desired Generalization  130
\ No newline at end of file
diff --git a/assets/txts/pg_0020.txt b/assets/txts/pg_0020.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f2b9d2d9eaff250d78410357af3a9b4945d5bc6a
--- /dev/null
+++ b/assets/txts/pg_0020.txt
@@ -0,0 +1,320 @@
+    5.4.4 DUDE Competition Protocol  131
+      5.4.4.1 Task Formulation  132
+      5.4.4.2 Evaluation Protocol  132
+  5.5 DUDE Benchmark  133
+    5.5.1 Baselines  133
+    5.5.2 Analysis & Discussion  134
+  5.6 Detailed Results Analysis  136
+    5.6.1 Within Model Class Analysis  136
+      5.6.1.1 Encoder vs. Decoder  136
+      5.6.1.2 Incorporating Layout & Vision  136
+      5.6.1.3 Toward Long Document Processing  136
+      5.6.1.4 Diagnosis of LLM Results  137
+    5.6.2 Assessing Confidence  138
+  5.7 DUDE Competition Results  138
+    5.7.1 Submitted Methods  138
+    5.7.2 Performance Analysis  139
+  5.8 Chapter Conclusion  144
+
+6 DistilDoc: Knowledge Distillation for Visually-Rich Document Applications  145
+  6.1 Introduction  147
+  6.2 Related Work  149
+  6.3 Experimental Setup  151
+    6.3.1 Datasets  152
+    6.3.2 Architectures and Backbones  153
+    6.3.3 KD Methods  155
+    6.3.4 Evaluation  157
+    6.3.5 DLA-enriched LLM prompting  158
+  6.4 Results & Discussion  158
+  6.5 Chapter Conclusion  163
+
+7 Conclusion  165
+  7.1 Summary  165
+  7.2 Perspectives For Future Research  171
+    7.2.1 Open Problems In Reliability & Robustness  172
+    7.2.2 A Future-Proof Design Of IA-DU  173
+      7.2.2.1 The ‘Ultimate’ DU Dataset?  173
+      7.2.2.2 A Feature-complete IA-DU Solution?  178
+
+Bibliography  181
+
+A Appendix - PUQ  223
+  A Implementation Details  223
\ No newline at end of file
diff --git a/assets/txts/pg_0021.txt b/assets/txts/pg_0021.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9e8d5f09dc4d595e82b75ae821605890ac7ea717
--- /dev/null
+++ b/assets/txts/pg_0021.txt
@@ -0,0 +1,421 @@
+    A.1 Software and Data  223
+    A.2 Hyperparameter Defaults  223
+  B Practical Considerations  224
+    B.1 Take-home Summary  224
+    B.2 Compute vs. Performance Trade-off  225
+  C Detailed Experiment Results  226
+    C.1 Zoom-in Benchmark Evidence  226
+    C.2 Absolute Benchmark Results  226
+
+B Appendix - BDPC  230
+  A Existing DC Datasets  230
230 +B +Visualization of Proposed DC Datasets . . . . . . . . . . . . . . 231 +C Appendix - DUDE +A +Baseline Experiments Setup . . . . . . . . . . +A.1 +Hyperparameter Defaults . . . . . . . +A.2 +Generative LLM Prompt Fine-tuning +A.3 +Confidence Estimation . . . . . . . . . +A.4 +Evaluation . . . . . . . . . . . . . . . +B +Qualitative Examples . . . . . . . . . . . . . +B.1 +Qualitative Examples - Competition . + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +. +. +. +. +. +. +. + +232 +232 +232 +232 +233 +235 +235 +241 + +D Appendix - KDD +A +Code and Datasets . . . . . . . . . . . +B +Implementation Details . . . . . . . . +C +Task Definitions . . . . . . . . . . . . +D +Additional Experiment Results . . . . +D.1 +Tobacco-3482 Results . . . . . +D.2 +PRImA Results . . . . . . . . . +D.3 +RVL-CDIP-N Results . . . . . +D.4 +Downstream DocVQA Results +D.5 +Ablation Experiments . . . . . + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +244 +244 +244 +246 +247 +249 +249 +249 +249 +249 + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. + +Curriculum + +253 + +Publications + +255 + + \ No newline at end of file diff --git a/assets/txts/pg_0033.txt b/assets/txts/pg_0033.txt new file mode 100644 index 0000000000000000000000000000000000000000..57094c296cbbaf9ba200a76cba188d7cd8ef485d --- /dev/null +++ b/assets/txts/pg_0033.txt @@ -0,0 +1,30 @@ +Chapter 1 + +Introduction +“yourAmid +significant life events—like buying a house or expecting +firstborn child—lies a less cheerful reality that I experienced +firsthand: the hassle of dealing with manual paperwork. + +For the former case, this required a lot of back-and-forth with +the bank, the notary, and the real estate agent, with each of +them requiring a different set of documents (e.g., monthly pay +stubs, bank statements, copies of national registry, etc.) to be +filled in, signed, and sent back for processing. +On the side of the document processors, each document needed +to be classified, key information extracted, and the information +validated against other documents to be able to prove my +solvency in making an offer, applying for a loan, or being drafted +as the future house owner. In between all parties and external +organizations, even more documents were either created, adapted, +or passed along such as the offer, the loan agreement, the deed +of sale, a soil certificate, etc. +This juxtaposition of valuable moments in life with cumbersome +administrative procedures involving manual document +processing forms the backdrop against which I aim to explore +and propose potential solutions in this thesis. + +” +1 + + \ No newline at end of file diff --git a/assets/txts/pg_0034.txt b/assets/txts/pg_0034.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9c2d135877ee5ba3d879253a8818af58f115f27 --- /dev/null +++ b/assets/txts/pg_0034.txt @@ -0,0 +1,44 @@ +2 + +INTRODUCTION + +Documents are containers of information that are easily shareable. 
The concept of a document dates back to when humans started writing and has been a cornerstone of human communication ever since. In the age of digital technology, documents are still the primary means of communication between humans and organizations and form the backbone of many business processes. Human communication is increasingly happening through digital channels, and the COVID-19 pandemic has only accelerated this trend. We are increasingly living in a “document society” [53], dependent on documents in our daily lives or for recording second-hand knowledge. With instant gratification as the norm in the digital age, people expect similar seamless interactions with businesses and governments. While digitization has increased the speed and ease of document-based communication, document processing remains a largely human effort, with organizations drowning under the sheer volume of documents they receive.

So why have organizations not switched en masse to automated document processing?

The answer lies partly in (I) the complexity of the task, and partly in (II) the need for reliability and risk control.

(I) While it might be straightforward for a human (white-collar) worker to read a long, structured document, understand its contents, categorize it, and extract crucial information accordingly, this is not so easy for a machine. This could be perceived as an instance of Moravec’s paradox [319], which states that tasks that are easy for humans are hard for machines, and vice versa. However, in recent times, significant strides forward have been made thanks to technological advances combining Natural Language Processing (NLP), Computer Vision (CV) and Machine Learning (ML). Document Understanding (DU) is the umbrella term for both the end-to-end solution and the research field studying how to make machines interpret and understand documents (elaborated on in Section 2.3). It has seen a surge in interest in the past few years, with the rise of large-scale pretrained Language and Vision models (LLM, VLM) [52, 94, 101, 187, 380, 383, 502] capable of modeling document inputs.

What makes DU challenging is that it encompasses multiple subtasks, each of which is a research field in its own right, such as Optical Character Recognition (OCR), Document Layout Analysis (DLA), Document Classification (DC), Key Information Extraction (KIE), Visual Question Answering (VQA), etc. The complexity of the task is further increased by the fact that documents are multimodal, containing both text and images, and that they are compositional, i.e., the meaning of the document is not just the sum of its parts. Information can appear in a wide range of forms including text, images, tables or graphs, and be spread across multiple pages.

 \ No newline at end of file
diff --git a/assets/txts/pg_0035.txt b/assets/txts/pg_0035.txt
new file mode 100644
index 0000000000000000000000000000000000000000..236c98703711a3f2beeba1724b9a4950bd424684
--- /dev/null
+++ b/assets/txts/pg_0035.txt
@@ -0,0 +1,46 @@

Moreover, the meaning of a document can change depending on the context in which it is used. As an artifact of the communication channel, not all documents are born digitally, and the quality of the document can vary greatly, with some documents being handwritten, scanned with low resolution, or even a picture of a document.
Furthermore, documents are often not standardized templates and can be highly variable in terms of layout, structure, and content. Finally, the longer the document, the more computationally demanding it becomes to process, and the more likely it is to induce errors, which can be harder to detect.

Addressing the inherent challenges of document processing, and achieving high levels of accuracy, processing speed, reliability, robustness, and scalability in DU, forms the applied scope of this thesis.

(II) Consider the earlier example of the birth certificate. While I might not much appreciate the manual handling of this document, if they had registered my baby girl’s name (Feliz, Spanish spelling, without an accent on the ‘e’) incorrectly, I would be pretty upset, as this could have further repercussions. Whereas this error might be easily rectified, it is not so easy to do so in the case of a mortgage application, where the wrong information could lead to a rejection of the application, or even worse, a loan agreement with the wrong terms and conditions. This demonstrates that, even when full automation of document processing is in high demand, it is not always desirable if the risk of failure is too large.

Nevertheless, a lot of the potential for automation remains untapped, and organizations are increasingly looking for solutions to fully automate their document processing workflows. However, full automation, implying perfect recognition of document categories and impeccable information extraction, is an unattainable goal with the current state of technology [79]. The more realistic objective is Intelligent Automation (IA) (elaborated on in Section 2.4), where the goal is to have the machine estimate confidence in its predictions, deriving business value from the highest possible volume of perfect predictions (Straight-Through-Processing, STP) without incurring extra costs (False Positives, FP).

The leitmotif of this thesis will be the fundamental enablers of IA: confidence estimation and failure prediction. Calibrated uncertainty estimation with efficient and effective DU technology will allow organizations to confidently automate their document processing workflow, while keeping a human in the loop only for predictions with a higher likelihood of being wrong. To date, however, little research has addressed the question of how to make DU technology more reliable, as is illustrated in a toy analysis (Table 1.1) reporting the absence of many IA-related keywords in the Proceedings of the 2021 International Conference on Document Analysis and Recognition (ICDAR) [289].

 \ No newline at end of file
diff --git a/assets/txts/pg_0036.txt b/assets/txts/pg_0036.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19747cca63d44df370b862437a8c601a3f946800
--- /dev/null
+++ b/assets/txts/pg_0036.txt
@@ -0,0 +1,80 @@

The thesis aims to fill this gap by proposing novel methods for uncertainty estimation and failure prediction (Part I), and by providing a framework for benchmarking and evaluating the reliability and robustness of DU technology, as close as possible to real-world requirements (Part II).

Table 1.1. Comparative analysis of keywords in the ICDAR 2021 proceedings. While many DU subtasks are represented, there is a lack of keywords related to IA. Do note that calibration is used in the context of camera calibration, and not in the context of confidence estimation.
keyword                         freq
document                        3388
classification                   242
key information                   56
question answering               106
layout analysis                  223
calibration/calibrate             33
temperature scaling                0
failure prediction                 0
misclassification detection        0
out-of-distribution/OOD           25
predictive uncertainty             0

In the remainder of the Introduction, I will sketch the surrounding research context, followed by the problem statement and research questions, and finally the outline of the thesis manuscript.

1.1 Research Context

All chapters of this dissertation have been executed as part of the Baekeland PhD mandate (HBC.2019.2604) with financial support of VLAIO (Flemish Innovation & Entrepreneurship) and Contract.fit. The latter is a Belgian-based software-as-a-service (SaaS) provider of Intelligent Document Processing (IDP) drawing on innovations in DU to power their product suite (email-routing, Parble), and my generous employer since 2017.

Some of the joint work (Chapter 5) has been partially funded by a PhD Scholarship from AGAUR (2023 FI-3-00223), and the Smart Growth Operational Programme under projects no. POIR.01.01.01-00-1624/20 (Hiper-OCR - an innovative solution for information extraction from scanned documents) and POIR.01.01.01-00-0605/19 (Disruptive adoption of Neural Language Modelling for automation of text-intensive work).

Moreover, given that the dissertation work has been performed over a large span of time, it warrants putting it in the larger context and dynamics of AI innovations, the state of DU as a field, how notions of ‘reliability’ have evolved over time, and finally the business context.

 \ No newline at end of file
diff --git a/assets/txts/pg_0037.txt b/assets/txts/pg_0037.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7e98d1b8513fc389c48dd03f28fdfd521ee8dafd
--- /dev/null
+++ b/assets/txts/pg_0037.txt
@@ -0,0 +1,42 @@

This thesis started almost concurrently with the rise of the global COVID-19 pandemic, making it hard to foster collaborations in the early stages. At the start of the PhD, DU methodology was fairly established, with OCR and Transformer-based pipelines such as BERT [94] and LayoutLM [502], which is why we first prioritized the more fundamental challenge of decision-making under uncertainty (Part I), followed by a step back, closer to applied DU research (Part II).

The research community’s understanding of ‘reliability’ has also evolved over time. When starting the work of Chapter 3, the notion of reliability was mostly associated with uncertainty quantification and calibration. However, calibration is not a panacea, and only fairly recently, Jaeger et al. [193] proposed a more general framework encapsulating reliability and robustness. They promote the more concrete and useful notion of failure prediction, which still involves confidence/uncertainty estimation, yet with an explicit definition of the failure source that one wants to detect or guard against, e.g., in-domain test errors, changing input feature distributions, novel class shifts, etc. Since I share a similar view of the problem, I have focused subsequent works on the more general notion of failure prediction, which is also more in line with the business context of IA.
Whereas we originally intended to work on multi-task learning of DU subtasks, the rise of general-purpose LLMs offering a natural language interface to documents rather than discriminative modeling (e.g., ChatGPT [52, 344]) prompted us toward evaluating this promising technology in the context of DU. More importantly, we observed the lack of sufficiently complex datasets and benchmarks in DU that would allow us to tackle larger, more fundamental questions such as ‘Do text-only LLMs suffice for most low-level DU subtasks?’ (subsequently tackled in Chapter 5), which is why we shifted our focus to the more applied research questions of benchmarking and evaluation (Part II).

Finally, the business context has also evolved over time. Originally, IDP was practiced by legacy OCR companies; specialized vendors, offering a range of solutions for specific document types (e.g., invoices, contracts, tax forms, etc.); or cloud service providers, offering IDP as part of a larger suite of services (e.g., AWS Textract, Azure Form Recognizer, etc.). However, the rise of both open-source LLM development and powerful, though closed-source models has lowered the barrier to entry for any new entrants or incumbents. This has led to a commoditization of IDP, with the quality of the LLMs and the ease of integration with existing business processes becoming key differentiators.

 \ No newline at end of file
diff --git a/assets/txts/pg_0038.txt b/assets/txts/pg_0038.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6f8b0de8c96bc1502b9dbd808025374cfe0728da
--- /dev/null
+++ b/assets/txts/pg_0038.txt
@@ -0,0 +1,45 @@

1.2 Problem Statement and Questions

The general introduction sketches the context of the research and motivates the research questions. In this Section, I will formulate the problem statement and research questions more formally, and explain how they relate to the manuscript’s contents.

1.2.1 Reliable and Robust Deep Learning

The dissertation opens with the more fundamental challenge of targeting reliability and robustness in Deep Learning, which covers fairly abstract concepts that have been used interchangeably and inconsistently in the literature. They will be defined more extensively in Section 2.2, but for now, consider reliability as the ability to avoid failure, robustness as the ability to resist failure, and resilience as the ability to recover from failure [373, 438, 455]. In Chapter 3, we focus on the more concrete objective of predictive uncertainty quantification (PUQ), which shows promise for improving reliability and robustness in Deep Learning (DL) [123, 140, 173, 455]. Concretely, PUQ methods are expected to elucidate sources of uncertainty, such as a model’s lack of in-domain knowledge due to either training data scarcity or model misspecification, or its ability to flag potentially noisy, shifted or unknown input data [136].

We observed that the majority of prior PUQ research focused on regression and CV tasks, while the applicability of PUQ methods had not been thoroughly explored in the context of NLP. As mentioned earlier, most DU pipelines (in 2020) were text-centric with a high dependency on the quality of OCR. Since OCR is often considered a solved problem [262], we hypothesized that the main source of error and uncertainty in DU would reside in the text representations learned by deep neural networks (DNNs).
This is why we focused on the more fundamental question of how well PUQ methods scale in NLP. More specifically, we restricted the scope to the prototypical, well-studied task of text classification, for which we could leverage existing multi-domain datasets varying in complexity, size and label space (multi-class vs. multi-label).

This leads to the following research questions:

RQ 1. When tested in realistic language data distributions on various text classification tasks, how well do PUQ methods fare in NLP?

 \ No newline at end of file
diff --git a/assets/txts/pg_0039.txt b/assets/txts/pg_0039.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6e982e5ac139ad4d5fcbace54f49f5db0d7753d8
--- /dev/null
+++ b/assets/txts/pg_0039.txt
@@ -0,0 +1,41 @@

RQ 2. In which settings are PUQ methods most useful, i.e., which failure sources / distribution shifts are they most sensitive to?

RQ 3. How can we obtain better PUQ estimates without overrelying on computationally prohibitive methods, e.g., Deep Ensemble [238]?

RQ 4. How important are certain prior, neural architecture or hyperparameter influences on the quality of PUQ estimation?

In a later chapter (Chapter 5), we introduce a complex benchmark for generic DU that additionally tests for robustness to domain, visual and layout shifts, and explores the novel problem of hallucination and control in natural language generation (NLG) with LLMs from the perspective of calibrated and selective DocVQA. The general task formulation involves a natural language question (on content, aspect, form, visual/layout), an input document, and a set of reference answers. The model is expected to provide a natural language answer, an answer confidence and a (binary) abstention decision. Evaluation is done in terms of answer correctness, calibration and selective prediction. On the one hand, one expects a model to lower confidence when unsure about the correctness of a predicted answer. On the other hand, one expects a model to abstain from answering and refrain from hallucinations on unanswerable questions (which had been explicitly added in the dataset).

RQ 5. How severe is the problem of hallucination and control in LLMs when evaluated in a selective, free-form DocVQA task setting?

1.2.2 Realistic and Efficient Document Understanding

The second part of the dissertation focuses on the more applied research questions of realistic and efficient DU. The overall objective is to make DU technology more generically applicable (Chapter 5), evaluation more in sync with real-world requirements (Chapters 4 and 5), and more efficient at modeling the multimodal and compositional nature of documents (Chapters 5 and 6).

Due to the proximity to business applications and the risks of leaking personal information, DU research benchmarks have diverged substantially from the real-world distributions of document data. For instance, DU datasets are often limited to single-page document images, are from outdated sources (e.g., IIT-CDIP [252]), or are restricted to a single domain or a small set of document types.

 \ No newline at end of file
diff --git a/assets/txts/pg_0040.txt b/assets/txts/pg_0040.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fc4a62691bd1c7887f28bbb0c32878d6a1334e0b
--- /dev/null
+++ b/assets/txts/pg_0040.txt
@@ -0,0 +1,35 @@
+We posit that larger, fundamental questions in DU remain unanswered due to a +lack of sufficiently complex datasets and benchmarks with a rich methodology +covering evaluation beyond the independent and identically distributed (i.i.d.) +test set setting. While there exist performant models for DU subtasks such +as OCR, DC, KIE, etc., it is unclear how to move from these specific analysis +and recognition tasks to models that can reason and understand documents. A +truly end-to-end DU solution must handle the complexity and variety of realworld documents and subtasks, which could be expressed as natural language +questions. Moreover, it should be able to generalize to any question on any +document and reason over multiple pages and modalities. +The following research questions are addressed in Chapters 4 and 5: +RQ 6. How can we iteratively close the gap between research and practice in DU? +RQ 7. How can we design a resource that comprehensively challenges the state-ofthe-art? +RQ 8. Which DU aspects are most challenging for current state-of-the-art LLMs? +How can these be incorporated in a benchmark to allow proper measurements +of future improvements? +However, moving the goalpost beyond a single-page context inevitably requires +us to reconsider the research challenge of efficiency in DU. The rise of LLMs +has enabled a new generation of DU pipelines, which are more flexible and +easier to maintain than separate and specialized subtask modules, but also +more computationally demanding. Importantly, most LLMs are not designed +to handle the multimodality and long context windows of multipage documents, +and are often unaware of the visual and layout semantics of documents. +The research questions for Chapter 6 address the efficiency challenge in DU: +RQ 9. How can we efficiently infuse LLMs with semantic layout awareness for +more focused information extraction? +RQ 10. To what degree can model compression resolve the problem of efficiency +in processing documents? + + \ No newline at end of file diff --git a/assets/txts/pg_0041.txt b/assets/txts/pg_0041.txt new file mode 100644 index 0000000000000000000000000000000000000000..8931c41526dc7adef34baac4ef2402763feeb09d --- /dev/null +++ b/assets/txts/pg_0041.txt @@ -0,0 +1,27 @@ +OUTLINE + +1.3 + +9 + +Outline + +Figure 1.1. Overview of publications and how they relate to the chapters. + +Figure 1.2. Visual Overview of the research questions and how they relate to the +chapters. + +After the introductory Chapters 1 and 2, we continue with the publication-based +chapters that form the core of the thesis, which are structured in two parts. +Part I consists of a single chapter, Chapter 3, which presents a benchmarking +study of PUQ methods applied on real-world text classification datasets with +1-D convolutional neural networks and pretrained transformers. It motivates +a novel PUQ method, Deep Ensemble with Concrete Dropout, combining the +benefits of both methods, and showing promise for improving reliability and +robustness in NLP at a lower computational cost. The chapter concludes with +a discussion of the results, including targeted ablation studies, and provides +recommendations for future research. +Part II consists of three chapters, Chapters 4 to 6, which all focus on the more +applied research questions of realistic and efficient DU. 
 \ No newline at end of file
diff --git a/assets/txts/pg_0042.txt b/assets/txts/pg_0042.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7e2d484b3d991675fe433215940bb7e88c25708
--- /dev/null
+++ b/assets/txts/pg_0042.txt
@@ -0,0 +1,31 @@

Chapter 4 reflects on the current state of DU research, and proposes guidelines to foster document dataset construction efforts. It introduces two novel document classification datasets, RVL-CDIP_MP and RVL-CDIP-N_MP, as extensions of the RVL-CDIP dataset [165] with multipage documents. The datasets are accompanied by a comprehensive experimental analysis, which shows promise for advancing multipage document representations and inference.

Chapter 5 introduces the multi-faceted DUDE benchmark for assessing generic DU, which was also hosted as a competition to challenge the DU community. It describes the complete methodology and design of the dataset, targeting model innovations that can handle the complexity and variety of real-world documents and subtasks, and generalize to any documents and any questions. Next to a discussion of the competition results, it also presents our own comprehensive benchmarking study of SOTA LLMs, varying the context length and the modalities represented.

Chapter 6 investigates how to efficiently obtain more semantic document layout awareness. We explore what affects the teacher-student knowledge gap in KD-based model compression methods, and design a downstream task setup to evaluate the robustness of distilled DLA models on zero-shot layout-aware DocVQA.

Finally, Chapter 7 concludes the thesis with a summary of the main contributions (Section 7.1), and a discussion of future research directions. As a logical follow-up to Chapter 5, we propose in Section 7.2.2.1 how the DUDE dataset could be extended to become the ‘ultimate’ DU benchmark. The thesis ends with a hypothetical, informed design of how the research presented would form part of an end-to-end, fully-fledged IA-DU solution (Section 7.2.2.2).

 \ No newline at end of file
diff --git a/assets/txts/pg_0043.txt b/assets/txts/pg_0043.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36494ff567857e69aa11d563bca196f78d67b2c6
--- /dev/null
+++ b/assets/txts/pg_0043.txt
@@ -0,0 +1,32 @@

Chapter 2

Fundamentals

This chapter provides all the background knowledge necessary to understand the contributions of this thesis. The key questions covered here are:

i. How to feed a document to an algorithm to perform arbitrary tasks on it?
ii. How to model language, vision, layout or structure?
iii. How does it learn and then operate at inference time?
iv. How does it estimate prediction uncertainty?
v. How to evaluate its performance?
vi. How to integrate it as a useful, end-to-end system in a document workflow?

Section 2.1 explains the basic setting from the perspective of statistical learning theory [472], which is a mathematical framework for analyzing how algorithms learn from data with minimal error. Section 2.2 gives a primer on reliability and robustness, particularly calibration, failure detection and relevant evaluation metrics. Section 2.3 surveys the DU field, and discusses the state of the art in DU technology. Finally, Section 2.4 covers Intelligent Automation to illustrate how solving the challenges posed in this thesis will enable augmenting human intelligence, creativity and productivity in straight-through business processes.
+ +11 + + \ No newline at end of file diff --git a/assets/txts/pg_0044.txt b/assets/txts/pg_0044.txt new file mode 100644 index 0000000000000000000000000000000000000000..704dc7be0a638d940005f74cebb080df6d5fba1f --- /dev/null +++ b/assets/txts/pg_0044.txt @@ -0,0 +1,163 @@ +12 + +FUNDAMENTALS + +Contents +2.1 + +2.2 + +2.3 + +2.4 + +2.1 + +Statistical Learning - basics . . . . . . . . . . . . +2.1.1 Neural Networks . . . . . . . . . . . . . +2.1.2 Probabilistic Evaluation . . . . . . . . . +2.1.3 Architectures . . . . . . . . . . . . . . . +Reliability and Robustness . . . . . . . . . . . . +2.2.1 Generalization and Adaptation . . . . . +2.2.2 Confidence Estimation . . . . . . . . . . +2.2.3 Evaluation Metrics . . . . . . . . . . . . +2.2.4 Calibration . . . . . . . . . . . . . . . . +2.2.5 Predictive Uncertainty Quantification . . +2.2.6 Failure Prediction . . . . . . . . . . . . . +Document Understanding . . . . . . . . . . . . . +2.3.1 Task Definitions . . . . . . . . . . . . . . +2.3.2 Datasets . . . . . . . . . . . . . . . . . . +2.3.3 Models . . . . . . . . . . . . . . . . . . . +2.3.4 Challenges in Document Understanding +Intelligent Automation . . . . . . . . . . . . . . + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. +. + +12 +14 +15 +17 +18 +19 +20 +21 +25 +27 +29 +30 +31 +33 +34 +35 +38 + +Statistical Learning + +Two popular definitions of Machine Learning (ML) are given below. +Machine Learning is the field of study that gives computers the ability +to learn without being explicitly programmed. [406] +A computer program is said to learn from experience E with respect to +some class of tasks T, and performance measure P, if its performance +at tasks in T, as measured by P, improves with experience E. [317] +Following these, different types of learning problems [472] can be discerned, of +which the most common (and the one used throughout our works) is supervised +learning. It defines experience E as a set of input-output pairs for which the +task T is to learn a mapping f from inputs X ∈ X to outputs Y ∈ Y, and the +performance measure P is the risk or expected loss (Equation (2.1)), given a +(0-1) loss function ` : Y × Y → R+ . +R(f ) = E(X,Y )∼P [`(Y, f (X))] + +(2.1) + +The mapping f (·; θ) : X → Y is typically parameterized by a set of parameters +θ (omitted whenever it is fixed) and a hypothesis class F, which is a set of + + \ No newline at end of file diff --git a/assets/txts/pg_0045.txt b/assets/txts/pg_0045.txt new file mode 100644 index 0000000000000000000000000000000000000000..3afb663ca09e5aa63b40814121f6a0773a88264d --- /dev/null +++ b/assets/txts/pg_0045.txt @@ -0,0 +1,53 @@ +STATISTICAL LEARNING + +13 + +possible functions. The objective is to find a function f ∈ F that minimizes the +risk, or even better, the Bayes risk +f ∗ = inf R(f ), +f ∈F + +(2.2) + +which is the minimum achievable risk over all functions in F. The latter is only +realizable with infinite data or having access to the data-generating distribution +P(X , Y). In practice, Equation (2.2) is unknown, and the goal is to find a +function fˆ that minimizes the empirical risk +N +1 X +`(yi , f (xi )), +fˆ = +N i=1 + +(2.3) + +where (xi , yi ) are N independently and identically distributed (i.i.d.) samples +drawn from an unknown distribution P on X × Y. 
This is known as empirical risk minimization (ERM), which is a popular approach to supervised learning, under which three important processes are defined.

Training or model fitting is the process of estimating the parameters θ of a model, which is done by minimizing a suitable loss function ℓ over a training set D = {(x_i, y_i)}_{i=1}^{N} of N i.i.d. samples.

Inference or prediction is the process of estimating the output of a model for a given input, which is typically done by computing the posterior probability P(y|x) over the output space Y. Classification output is a discrete label, while regression output is a continuous value.

Evaluation involves measuring the quality of a model’s predictions, which is typically done by computing a suitable evaluation metric over a test set D_test of i.i.d. samples, which were not used for training.

However, ERM has its caveats concerning generalization to unseen data, requiring either additional assumptions on the hypothesis class F, which are known as inductive biases, and/or regularization to penalize the complexity of the function class F [445]. In neural networks (discussed in detail in Section 2.1.1), the former is controlled by the architecture of the network, while the latter involves specifying constraints to parameters or adding a regularization term to the loss function.

f̂ = argmin_{f∈F} R̂(f) + λΨ(θ)    (2.4)

 \ No newline at end of file
diff --git a/assets/txts/pg_0046.txt b/assets/txts/pg_0046.txt
new file mode 100644
index 0000000000000000000000000000000000000000..07884809c8a3e46062eae9a6696f8eb9953f830d
--- /dev/null
+++ b/assets/txts/pg_0046.txt
@@ -0,0 +1,47 @@

Equation (2.4) defines regularized empirical risk minimization (RERM), where Ψ(θ) is a regularization term and λ is a hyperparameter that controls the trade-off between the empirical risk (denoted with R̂) and the regularization term.

All these concepts will be revisited in the context of neural networks in Section 2.1.1, where we will also discuss the optimization process of the model parameters θ, how inference differs in the case of probabilistic models to estimate uncertainty (Section 2.2.5), and how regularization affects confidence estimation and calibration (Section 2.2.4).
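To make Equation (2.4) concrete, here is a toy sketch (ours, assuming a binary logistic-regression model; not the thesis code) that minimizes a regularized empirical risk by gradient descent, with Ψ(θ) = ‖θ‖² (weight decay) and trade-off λ:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def rerm_fit(X, y, lam=0.1, lr=0.1, steps=500):
    # Minimize empirical cross-entropy + lam * ||theta||^2 (Equation (2.4)).
    n, d = X.shape
    theta = np.zeros(d)
    for _ in range(steps):
        p = sigmoid(X @ theta)
        # Gradient of the empirical risk plus gradient of the L2 regularizer.
        grad = X.T @ (p - y) / n + 2 * lam * theta
        theta -= lr * grad
    return theta

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X @ np.array([1.5, -2.0, 0.5]) > 0).astype(float)
print("fitted parameters:", np.round(rerm_fit(X, y), 2))

Setting lam=0 recovers plain ERM; larger values trade training fit for smaller, simpler parameters.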
2.1.1 Neural Networks

An artificial neural network (NN) is a mathematical approximation inspired by data processing in the human brain [396]. It can be represented by a network topology of interconnected neurons that are organized in layers that successively refine intermediately learned feature representations of the input [448] that are useful for the task at hand, e.g., classifying an animal by means of its size, shape and fur, or detecting the sentiment of a review by focusing on adjectives.

A basic NN building block is a linear layer, which is a linear function of the input: f(x) = Wx + b, where the bias term b is a constant vector shifting the decision boundary away from the origin, and the weight matrix W holds most parameters that rotate the decision boundary in input space. Activation functions (e.g., tanh, ReLU, sigmoid, softmax, GELU) are used to introduce non-linearity in the model, which is required for learning complex functions.

The first deep learning (DL) network (stacking multiple linear layers) dates back to 1965 [191], yet the term ‘Deep Learning’ was coined in 1986 [398]. The first successful DL application was a demonstration of digit recognition in 1998 [244], followed by DL for CV [90, 223] and NLP [76]. The recent success of DL is attributed to the availability of large datasets, the increase in computational power, the development of new algorithms and architectures, and the commercial interest of large companies.

Consider a conventional DL architecture as a composition of parameterized functions. Each consists of a configuration of layers (e.g., convolution, pooling, activation function, normalization, embeddings) determining the type of input transformation (e.g., convolutional, recurrent, attention) with (trainable) parameters linear/non-linear w.r.t. the input x. Given the type of input, e.g., language, which is naturally discrete-sequential, or vision, which presents a ready continuous-spatial signal, different DL architectures have been established, which will be discussed in Section 2.1.3.

 \ No newline at end of file
diff --git a/assets/txts/pg_0047.txt b/assets/txts/pg_0047.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95df88d3ea4797f9b22ba3aafdfa7f60033ca999
--- /dev/null
+++ b/assets/txts/pg_0047.txt
@@ -0,0 +1,53 @@

A K-class classification function with an l-layer NN with d-dimensional input x ∈ R^d is shorthand f_θ : R^d → R^K, with θ = {θ_j}_{j=1}^{l} assumed to be optimized, either partially or fully, using backpropagation and a loss function. More specifically, it presents a non-convex optimization problem, concerning multiple feasible regions with multiple locally optimal points within each. With maximum-likelihood estimation, the goal is to find the optimal parameters or weights that minimize the loss function, effectively interpolating the training data. This process involves traversing the high-dimensional loss landscape. Upon convergence of model training, the optimized parameters form a solution in the weight-space, representing a unique mode (specific function f_θ̂). However, when regularization techniques such as weight decay, dropout, or early stopping are applied, the objective shifts towards maximum-a-posteriori (MAP), to take into account the prior probability of the parameters. The difference in parameter estimation forms the basis for several uncertainty estimation methods, covered in Section 2.2.5.

A prediction is a translation of a model’s output to which a standard decision rule is applied, e.g., to obtain the top-1/k prediction (Equation (2.5)), or decode structured output according to a function maximizing total likelihood with optionally additional diversity criteria.

ŷ = argmax f_θ̂(x)    (2.5)

Considering standard NNs, the last layer outputs a vector of real-valued logits z ∈ R^K, which in turn are normalized to a probability distribution over K classes using a sigmoid or softmax function (Table 2.1).

σ(z) = 1 / (1 + exp(−z))        softmax(z)_k = exp(z_k) / Σ_{j=1}^{K} exp(z_j)

Table 2.1. Sigmoid and softmax activation functions for binary and multi-class classification, respectively.
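As a small illustrative sketch (ours), the activations of Table 2.1 and the decision rule of Equation (2.5) can be combined as follows; subtracting the maximum logit before exponentiation is a standard numerical-stability trick that leaves the softmax unchanged:

import numpy as np

def softmax(z):
    # Stable softmax: shifting by the max logit avoids overflow.
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def predict_top1(logits):
    # Equation (2.5): the standard arg-max decision rule.
    return np.argmax(softmax(logits), axis=-1)

logits = np.array([[2.0, 0.5, -1.0],    # sharp distribution, class 0
                   [0.1, 0.0, 0.2]])    # nearly uniform distribution
print(softmax(logits).round(3))
print(predict_top1(logits))             # -> [0 2]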
2.1.2 Probabilistic Evaluation

The majority of our works involves supervised learning with NNs, formulated generically as a probabilistic predictor in Definition 1.

 \ No newline at end of file
diff --git a/assets/txts/pg_0048.txt b/assets/txts/pg_0048.txt
new file mode 100644
index 0000000000000000000000000000000000000000..45036e3b02241e313a1b6480f5de53b3d5cf0636
--- /dev/null
+++ b/assets/txts/pg_0048.txt
@@ -0,0 +1,45 @@

Definition 1 (Probabilistic Predictor). A probabilistic predictor is a function f : X → Δ_Y that outputs a conditional probability distribution P(y′|x) over outputs y′ ∈ Y for an i.i.d. drawn sample (x, y).

Definition 2 (Probability Simplex). Let Δ_Y := {v ∈ R_{≥0}^{|Y|} : ‖v‖₁ = 1} be a probability simplex of size |Y| − 1 as a geometric representation of a probability space, where each vertex represents a mutually exclusive label and each point has an associated probability vector v [368].

Figure 2.1 illustrates a multi-class classifier, where Y = [K] for K = 3 classes.

Figure 2.1. Scatter plot of a ternary problem (K = 3, N = 100) in the probability simplex space. Example of an overconfident misprediction (above is a Shiba Inu dog) and a correct sharp prediction (clear image of a Beagle).

In practice, loss functions are proper scoring rules [330], S : Δ_Y × Y → R, that measure the quality of a probabilistic prediction P(ŷ|x) given the true label y. The cross-entropy (CE) loss is a popular loss function for classification, while the mean-squared error (MSE) loss is used for regression. In Section 2.2, we will discuss the evaluation of probabilistic predictors in more detail, including the calibration of confidence estimates and the detection of out-of-distribution samples.
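For illustration (our own sketch; the formal definitions appear later as Equations (2.10) and (2.11) in Section 2.2.3), both scoring rules can be computed directly on predicted distributions, together with a check that each prediction indeed lies on the probability simplex of Definition 2:

import numpy as np

def on_simplex(p, tol=1e-9):
    # Definition 2: non-negative entries that sum to one.
    return bool(np.all(p >= -tol) and abs(p.sum() - 1.0) < tol)

def nll(probs, y_true):
    # Cross-entropy / negative log-likelihood: penalizes only the
    # probability assigned to the true class.
    return -np.log(probs[np.arange(len(y_true)), y_true]).mean()

def brier(probs, y_true, num_classes):
    # Mean squared distance between the predicted distribution and the
    # one-hot encoding of the true label.
    onehot = np.eye(num_classes)[y_true]
    return ((probs - onehot) ** 2).sum(axis=1).mean()

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])
y = np.array([0, 2])                     # the second prediction is wrong
assert all(on_simplex(p) for p in probs)
print(f"NLL: {nll(probs, y):.3f}, Brier: {brier(probs, y, 3):.3f}")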
2.1.3 Architectures

Throughout the chapters of the thesis, we have primarily used the following NN architectures: Convolutional Neural Networks (CNNs) and Transformer networks. We will briefly introduce the building blocks of these architectures, with a focus on how they are used in the context of document understanding.

 \ No newline at end of file
diff --git a/assets/txts/pg_0049.txt b/assets/txts/pg_0049.txt
new file mode 100644
index 0000000000000000000000000000000000000000..21180eaa1fcd8ce1b0a1484273e9eaf9ee6848b5
--- /dev/null
+++ b/assets/txts/pg_0049.txt
@@ -0,0 +1,41 @@

2.1.3.1 Convolutional Neural Networks

Convolutional Neural Networks (CNNs) [244] are a class of DNNs designed primarily for visual and grid-spatial data such as images. They are inspired by the visual cortex of animals, which contains neurons that are sensitive to small subregions of the visual field, called a receptive field. The receptive fields of different neurons partially overlap such that they cover the entire visual field, growing larger in deeper layers of the visual cortex.

Figure 2.2. Sketch of a CNN architecture. The input is a 2D image, which is iteratively convolved with a set of learned filters detecting specific input features, e.g., edges, corners, blobs, to produce feature maps. Feature maps are then downsampled using a pooling operation.

As illustrated in Figure 2.2, CNNs are composed of multiple convolutional layers, which hierarchically extract features from the input, followed by pooling and fully-connected layers to classify the input based on the downsampled features. A filter K ∈ R^{d×d} is a rectangular matrix of trainable weights with width and height d typically smaller than the input x. A convolutional layer applies filters sliding over the input, with each filter producing a feature map:

F = K ∗ x,    (2.6)

where the convolution operation ∗ computes a dot product between filter entries and the covered portions of the input.

Thanks to the weight-sharing property of the convolution operation, CNNs are able to learn translation invariance, i.e., the ability to recognize an object regardless of its position in the image. This is particularly useful for object detection, where the position of the object in the image is unknown.

This architecture was used for document image classification and document layout analysis (Section 6.3.2). A special version is 1-D CNNs, which we applied to one-hot encoded text data in text classification benchmarking (Section 3.4.3).
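A naive, direct implementation of Equation (2.6) follows (our sketch; production CNN layers add padding, strides and many parallel filters, and, as in most DL frameworks, compute cross-correlation, i.e., the filter is not flipped):

import numpy as np

def conv2d(x, k):
    # F = K * x: each output entry is the dot product between the filter
    # and the input patch it currently covers (valid padding, stride 1).
    h, w = x.shape
    d = k.shape[0]
    out = np.zeros((h - d + 1, w - d + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(k * x[i:i + d, j:j + d])
    return out

x = np.arange(25, dtype=float).reshape(5, 5)  # toy 5x5 "image"
k = np.array([[1.0, 0.0], [0.0, -1.0]])       # tiny 2x2 edge-like filter
print(conv2d(x, k))                           # 4x4 feature map

Because the same filter weights are reused at every spatial position, a feature is detected wherever it appears, which is the weight-sharing behind translation invariance.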
Recurrent Neural Networks (RNNs), and recurrent architectures extended to model long-range dependencies such as Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU) networks, were the dominant architectures for sequence modeling in NLP, yet they have been superseded by Transformers in recent years.

 \ No newline at end of file
diff --git a/assets/txts/pg_0051.txt b/assets/txts/pg_0051.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3de9aa17deb3cf903ad6925ca5c68ef5ef850e5e
--- /dev/null
+++ b/assets/txts/pg_0051.txt
@@ -0,0 +1,58 @@

2.1.3.3 Transformer Network

A Transformer [473] is a sequence-to-sequence model that uses an attention mechanism to capture long-range dependencies in the input sequence, benefiting from increased parallelization. Traditionally, it consists of an encoder and a decoder, each composed of multiple layers of self-attention and feed-forward layers.

Attention is a mechanism that allows for soft selection of relevant information from a set of candidates, e.g., tokens in a document, based on a query, e.g., a token in the document. The scaled dot-product attention is defined for a sequence of length n as follows:

Att(Q, K, V) = Σ_{i=1}^{n} α_i V_i

It utilizes three learnable weight matrices, each multiplied with all token embeddings in a sequence to build queries Q ∈ R^{n×d_q}, keys K ∈ R^{n×d_q}, and values V ∈ R^{n×d_v}. The output of the attention mechanism is a weighted sum of the unnormalized values, where each attention weight of the i-th key is computed by normalizing the dot product between the query and key vectors:

α_i = exp(Q_i^T K_i) / Σ_{j=1}^{n} exp(Q_j^T K_j)

For training stability, the dot product is typically scaled by the square root of the dimensionality of the query and key vectors. This is followed by a feed-forward layer to capture non-linear relationships between the tokens in the sequence.

There exist different forms of attention, depending on the type of relationship that is captured. Self-attention computes the attention of each token w.r.t. all other tokens in the sequence, which changes the representation of each token based on the other tokens in the sequence. Multi-head attention is a set of h attention layers, which every Transformer uses to concurrently capture different types of relationships, concatenated together after the parallelized processing. Cross-attention computes the attention of each token in one sequence w.r.t. all tokens in another sequence, which is used in encoder-decoder Transformer architectures for, e.g., summarization and machine translation. Specific to decoder layers, masked attention is used to prevent the decoder from attending to future tokens in the sequence by masking the upper triangle of the attention matrix calculation.

A major downside to Transformers is the quadratic complexity of the attention mechanism (Figure 2.3), which makes them computationally inefficient for long sequences. This has been addressed by a wealth of techniques [120], such as sparsifying attention, targeting recurrence, downsampling, and random or low-rank approximations.

Figure 2.3. Illustration of the main attention mechanisms in a Transformer.

 \ No newline at end of file
diff --git a/assets/txts/pg_0052.txt b/assets/txts/pg_0052.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95fde3416660d5dc9eb5031341f239b682ead4d3
--- /dev/null
+++ b/assets/txts/pg_0052.txt
@@ -0,0 +1,38 @@
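The attention computation described above can be sketched in a few lines (our illustration: a single head, no masking, and randomly initialized weight matrices for the example; the softmax normalizes each query's weights over all keys):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def attention(Q, K, V):
    # Scaled dot-product attention: attention weights are a softmax over
    # the query-key dot products, scaled by sqrt(d_q) for stability.
    d_q = Q.shape[-1]
    alpha = softmax(Q @ K.T / np.sqrt(d_q))   # (n, n) attention weights
    return alpha @ V                          # weighted sum of values

rng = np.random.default_rng(0)
n, d = 4, 8                                   # sequence length, head dim
X = rng.normal(size=(n, d))                   # token embeddings
Wq, Wk, Wv = (rng.normal(size=(d, d)) for _ in range(3))
out = attention(X @ Wq, X @ Wk, X @ Wv)
print(out.shape)                              # (4, 8)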
The most common type of position embedding is a sinusoidal + + \ No newline at end of file diff --git a/assets/txts/pg_0052.txt b/assets/txts/pg_0052.txt new file mode 100644 index 0000000000000000000000000000000000000000..95fde3416660d5dc9eb5031341f239b682ead4d3 --- /dev/null +++ b/assets/txts/pg_0052.txt @@ -0,0 +1,38 @@ +20 + +FUNDAMENTALS + +Quadratic complexity + +Figure 2.3. Illustration of the main attention mechanisms in a Transformer. + +embedding with a fixed frequency and phase, f (x) = sin(ωx + φ), where ω is the +frequency and φ is the phase which are learned as part of the training process, +and they are typically shared across all tokens in the sequence. Integrating +position information into Transformers can be achieved in different ways, which +[105, Table 1] gives an overview for. +Transformers have gradually taken over as an end-to-end architecture for both +NLP and CV tasks, albeit adoption in CV has been slower, due to the lack +of spatial invariance in the original Transformer architecture. This has been +addressed by recent works, such as Vision Transformer (ViT) [101], which uses +a patch-based input representation with position embeddings. +A large language model (LLM) consists of a stack of Transformers that is +pretrained on a large corpus of text, typically using a self-supervised learning +objective, such as predicting the next token in a sequence. The goal of LLMs +is to learn a general-purpose language representation that can be fine-tuned +to perform well on a wide range of downstream tasks. LLMs have disrupted +NLP in recent years, as they have achieved SOTA performance on a wide +range of tasks thanks to pretraining on large amounts of data. The most +popular LLMs are BERT [95], RoBERTa [287], ELECTRA [73], T5 [383], +GPT-3 [52], Llama-2 [452], and Mistral [199]. Next to challenges specific to +modeling document inputs, explained in Section 2.3.4, open challenges for +LLMs include: (i) structured output generation, (ii) domain-specific knowledge +injection (e.g., does retrieval-augmented generation (RAG) suffice? [253, 347]), +(iii) multimodality. +Vision-language models (VLM) are a recent development in multimodal +learning, which combine the power of LLMs with vision encoders to perform +tasks that require understanding both visual and textual information. The most +popular VLMs are CLIP [381], UNITER [70], FLAVA [423] and GPT-4 [344]. +In every chapter of this dissertation we have used Transformers, either as part + + \ No newline at end of file diff --git a/assets/txts/pg_0053.txt b/assets/txts/pg_0053.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a9a308e37ea6c8bae8d6d4688fd3f439f56bc99 --- /dev/null +++ b/assets/txts/pg_0053.txt @@ -0,0 +1,46 @@ +RELIABILITY AND ROBUSTNESS + +21 + +of a foundation model for DU tasks (Chapters 4 to 6) or to contrast with 1-D +CNNs in text classification (Chapter 3). Note that [265] share our concerns that +NLP needs a new ‘playground’ with more realistic tasks and benchmarks, which +extend beyond sentence-level contexts to more complex document-level tasks. +Alternative sub-quadratic architectures have started addressing Transformer’s +computational inefficiency on long sequences, e.g., Mamba [152] and Longnet +[99]. Time will tell if these will be able to compete with the Transformer’s +dominance in foundation models. 
+ +2.2 + +Reliability and Robustness + +Chapter 3 contains a lot of relevant content on the basic relation between +uncertainty quantification, calibration, and distributional generalization or +detection tasks. Here, we will focus on the more general concepts of reliability +and robustness, and how they relate to concepts used throughout the rest of +the thesis. Next, we discuss the need for confidence estimation and appropriate +evaluation metrics, followed by short summaries of the main research trends in +calibration and uncertainty quantification. +Emerging guidance and regulations [2, 3, 475] place increasing importance on +the reliability and robustness of ML systems, particularly once they are used +in the public sphere or in safety-critical applications. In ML, reliability and +robustness are often used interchangeably [78, 420, 455], yet they are distinct +concepts, and it is important to understand the difference between them. This +thesis uses the following definitions of reliability and robustness, adapted from +systems engineering literature [395]: +Definition 3 [Reliability]. Reliability is the ability of a system to consistently +perform its intended function in a specific, known environment for a specific +period of time, with a specific level of expected accuracy [395]. Closer to the ML +context, this entails all evaluation under the i.i.d. assumption, allowing for some +benign shifts of the distribution, including predictive performance evaluation +with task-dependent metrics (accuracy, F1, perplexity, etc.), calibration, selective +prediction, uncertainty estimation, etc. +Reliability requires to clearly specify the role an ML component plays in a +larger system, and to define the expected behavior of the system as a function +of alignment with the training data distribution. This is particularly important +in the context of black-box models, where the inner workings of the model are +not transparent to the user. In this case, the user needs to be aware of the +model’s limitations, e.g., model misspecification, lack of training data, and the + + \ No newline at end of file diff --git a/assets/txts/pg_0054.txt b/assets/txts/pg_0054.txt new file mode 100644 index 0000000000000000000000000000000000000000..8311870e0107a75e565ce0ffd60775b808ca2926 --- /dev/null +++ b/assets/txts/pg_0054.txt @@ -0,0 +1,45 @@ +22 + +FUNDAMENTALS + +model needs to be able to communicate its own uncertainty to the user. This is +the focus of Chapter 3. +Definition 4 [Robustness]. Robustness is the ability of a system to maintain +its intended function despite a wide range of disturbances, with a minimal +degradation of performance [395]. Such disturbances can take the form of +adversarial attacks, distributional shifts, or other types of noise. In the ML +context, this entails all evaluation violating the i.i.d. assumption, including +adversarial and label noise robustness, out-of-distribution detection, domain +generalization, extrapolation, etc. +Robustness is more involved with the application scope in which a model can +perform well, assuming that the model can maintain some degree of its prediction +capacity on non-i.i.d. data which might be unknown at training time. Detecting +when the model is operating outside of its intended scope is an important part +of robustness to prevent failure propagation to downstream systems. 
Resilience is another component of the R3 (reliability, robustness, resilience) concept in systems engineering, yet it is not a focus of this thesis, nor is it a relevant qualifier of the ML model in isolation, as it is more related to the system as a whole. Resilient systems are able to recover from disturbances, even those caused by model misspecification, e.g., by adapting to new environments and unexpected inputs from unknown distributions or by self-healing.

2.2.1 Generalization and Adaptation

To complete the R3 picture, we cannot overlook the generalization-adaptation spectrum, which has been less explored in our works, yet it is an important part of current practices in ML.

Definition 5 [Generalization-adaptation]. Generalization is the ability of a system to perform its intended function in a wide range of environments, including those not known at design time [395]. Each environment is defined by a data distribution over a domain and a task, and generalization is the ability of a model to perform well on new data drawn from the same distribution. Adaptation is the ability of a system to perform its intended function in a specific, known environment, despite changes in the system itself or its environment [395]. This entails the ability of a model to perform well on new data drawn from a different distribution, which is known at design time.

Different settings of generalization-adaptation are: in-distribution (same domain and task), domain generalization (same task, different domain), task generalization (same domain, different task), and out-of-distribution (different domain or task).

 \ No newline at end of file
diff --git a/assets/txts/pg_0055.txt b/assets/txts/pg_0055.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1abe0408da2c51ef0a6e8497de5a0ab28cd8a8ac
--- /dev/null
+++ b/assets/txts/pg_0055.txt
@@ -0,0 +1,45 @@

If the model has access to limited samples for training on the new distribution, it is referred to as few-shot learning (or, with no samples at all, zero-shot learning); if it is able to adapt to new distributions over time, or accumulate knowledge over different tasks without retraining from scratch [87], it is referred to as continual learning or incremental learning.

Many of these settings are referred to in business as out-of-the-box or self-learning, yet without any formal definitions given. Domain and task generalization are major selling points of pretrained LLMs, which are able to perform well on a wide range of tasks and domains. In the case of very different distributions, e.g., a different task/expected output or an additional domain/input modality, it is often necessary to fine-tune the model on a small amount of data from the new distribution, which is known as transfer learning. Specific to LLMs, instruction tuning is a form of transfer learning, where samples from a new distribution are appended with natural language instructions [69, 532]. This approach has been used in Chapter 5 to adapt pretrained LLMs to the task of DocVQA, in an effort to reduce the amount of annotated data required to generalize to unseen domains and questions.

2.2.2 Confidence Estimation

A quintessential component of reliability and robustness requires a model to estimate its own uncertainty, or inversely to translate model outputs into probabilities or ‘confidence’ (Definition 6).

Definition 6 [Confidence Scoring Function].
Definition 6 [Confidence Scoring Function]. Any function g : X → R whose continuous output aims to separate a model's failures from correct predictions can be interpreted as a confidence scoring function (CSF) [193]. Note that while it is preferable to have the output domain of g in [0, 1] for easier thresholding, this is not a strict requirement.

Circling back to the question of why one needs a CSF, there are multiple reasons: i) ML models are continually improving, yet zero test error is an illusion; even a toy dataset (MNIST) is not perfectly separable; ii) once a model is deployed, performance deterioration is expected as i.i.d. assumptions break; iii) generative models are prone to hallucinations [198], requiring control mechanisms and guardrails to guide them.

Below, we present some common CSFs used in practice [114, 172, 194, 539], where for convenience the subscript is reused to denote the k-th element of the output vector, g(x) = g_k(x).

I. Maximum softmax probability (MSP): g(x) = max_{y'∈Y} f_{y'}(x)
II. Maximum logit: g(x) = max_{y'∈Y} z_{y'}(x), with logits z ∈ R^K
III. Negative entropy: g(x) = Σ_{y'∈Y} f_{y'}(x) log f_{y'}(x)
IV. Margin: g(x) = max_{y'∈Y} f_{y'}(x) − max_{y''∈Y∖y'} f_{y''}(x)
V. Distance-based measures
• kNN distance: a 1D outlier score derived from the average distance of the feature representation of x to its k nearest neighbors in the training distribution
• Mahalanobis distance [390]: the minimum distance of the feature map (e.g., penultimate layer activations) of a test input to class-conditional Gaussian distributions of the training data
VI. Bayesian uncertainty estimation

Chapter 3 used MSP and negative entropy as CSFs, next to various PUQ methods for Bayesian uncertainty estimation. Other chapters used MSP, as it is the most common CSF in practice, requiring only logits as input; the first four CSFs are sketched in code below. From the use of CSFs also follows the need to evaluate their statistical quality next to task-specific predictive performance metrics, which is discussed next.
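The following is a minimal sketch of CSFs I–IV operating on raw logits, assuming only NumPy and a standard softmax; function names are ours, chosen for illustration.

    import numpy as np

    def softmax(z, axis=-1):
        z = z - z.max(axis=axis, keepdims=True)  # numerical stabilization
        e = np.exp(z)
        return e / e.sum(axis=axis, keepdims=True)

    def msp(logits):
        """I. Maximum softmax probability."""
        return softmax(logits).max(axis=-1)

    def max_logit(logits):
        """II. Maximum logit."""
        return logits.max(axis=-1)

    def negative_entropy(logits, eps=1e-12):
        """III. Negative entropy of the softmax distribution (higher = more confident)."""
        p = softmax(logits)
        return (p * np.log(p + eps)).sum(axis=-1)

    def margin(logits):
        """IV. Gap between the two largest softmax probabilities."""
        p = np.sort(softmax(logits), axis=-1)
        return p[..., -1] - p[..., -2]

Only MSP and margin are bounded in [0, 1] (resp. [-1, 1]); the other two illustrate that a CSF merely needs to rank failures below successes.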
2.2.3 Evaluation Metrics

In an ideal world, the evaluation metric of interest would be the same as the loss function used for training, yet this is rarely the case in practice: gradient-based optimization requires a continuously differentiable function, while the metric of interest is often non-differentiable, e.g., accuracy vs. cross-entropy in classification.

Throughout our works, we have used (or extended) multiple predictive performance, calibration, and robustness metrics, of which the most interesting are outlined below.

Average Normalized Levenshtein Similarity (ANLS) is a metric introduced in [39] for the evaluation of VQA, which was later extended [449] to support lists and be invariant to the order of provided answers. We adapted the underlying Levenshtein Distance (LD) metric [251] to support not-answerable questions, NA(G) = I[type(G) = not-answerable] (see Equation (2.7)).

Consider for simplicity the evaluation of a single non-list ground truth answer G and prediction P̂, with string lengths |G| and |P̂|, respectively.

    LD(G, P̂) =
      1                                 if NA(G) ∧ |P̂| > 0
      0                                 if NA(G) ∧ |P̂| = 0
      |G|                               if |P̂| = 0
      |P̂|                               if |G| = 0
      LD(tail(G), tail(P̂))              if G[0] = P̂[0]
      1 + min{ LD(tail(G), P̂)           (deletion)
               LD(G, tail(P̂))           (insertion)
               LD(tail(G), tail(P̂)) }   (substitution)   if G[0] ≠ P̂[0]
    (2.7)

Each of the conditions is tested in turn, and the first one that is true is executed. The normalized similarity metric is then defined as

    NLS(G, P̂) = 1 − LD(G, P̂) / max(1, |G|, |P̂|).

Given multiple ground-truth answer variants G_i = {a_1, a_2, ...} and a predicted answer P̂_{Q_i} for each question Q_i in the test set of size N, we define the complete metric as follows:

    ANLS = (1/N) Σ_{i=1}^{N} max_{a∈G_i} s(a, P̂_{Q_i})    (2.8)

    s(a, P̂_{Q_i}) = NLS(a, P̂_{Q_i})   if NLS(a, P̂_{Q_i}) ≥ τ
                    0                  if NLS(a, P̂_{Q_i}) < τ    (2.9)

where we follow prior literature [39, 449] in setting the threshold τ = 0.5. In the case of a list-type question, Hungarian matching is performed following [449] according to the NLS between each ground-truth answer part and each predicted answer part.
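To make Equations (2.7)–(2.9) concrete, a compact sketch for single-answer (non-list) questions follows; the Hungarian matching used for list questions is omitted, and all function names are ours.

    import numpy as np

    def levenshtein(g: str, p: str, not_answerable: bool = False) -> int:
        """Edit distance with the not-answerable convention of Eq. (2.7):
        any non-empty prediction for a not-answerable question costs 1."""
        if not_answerable:
            return 1 if len(p) > 0 else 0
        # dynamic-programming formulation of the recursion in Eq. (2.7)
        dp = np.arange(len(p) + 1)
        for i in range(1, len(g) + 1):
            prev, dp[0] = dp[0], i
            for j in range(1, len(p) + 1):
                cur = dp[j]
                dp[j] = prev if g[i-1] == p[j-1] else 1 + min(prev, dp[j], dp[j-1])
                prev = cur
        return int(dp[-1])

    def nls(g: str, p: str, not_answerable: bool = False) -> float:
        return 1.0 - levenshtein(g, p, not_answerable) / max(1, len(g), len(p))

    def anls(gold: list[list[str]], preds: list[str], tau: float = 0.5) -> float:
        """Eq. (2.8): average over questions of the best thresholded NLS."""
        scores = []
        for variants, pred in zip(gold, preds):
            best = max(nls(a, pred) for a in variants)
            scores.append(best if best >= tau else 0.0)  # Eq. (2.9)
        return float(np.mean(scores))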
Proper scoring rules [330] are used for generic evaluation of predictive performance; they score at the instance level while measuring both the quality of the predictive function and the predicted probability distribution (as such, they are not compatible with an arbitrary CSF):

• Negative Log-Likelihood (NLL) [378] is both a popular loss function (cross-entropy) and a scoring rule which only penalizes the (wrong) log probability given to the true class, with I an indicator function selecting the true class. This measure more heavily penalizes sharp probabilities that place mass near the wrong class through over- or under-confidence.

    ℓ_NLL(f) = −(1/N) Σ_{i=1}^{N} Σ_{k=1}^{K} I[y_i = k] · log f_k(x_i)    (2.10)

• Brier Score [50] is a scoring rule that measures the accuracy of a probabilistic classifier and is related to the mean-squared error (MSE) loss function. The Brier score is more commonly used in industrial practice since it is an ℓ2 metric (score between 0 and 1), yet it penalizes tail probabilities less severely than NLL.

    ℓ_BS(f) = (1/N) Σ_{i=1}^{N} Σ_{k=1}^{K} (I[y_i = k] − f_k(x_i))²    (2.11)

All of the following metrics require a CSF g(x) to be defined, and can pertain to specific evaluation settings [389] tested in Section 3.4.5.

Expected Calibration Error (ECE) [156, 332] is a default metric to evaluate top-1 prediction miscalibration. A calibration estimator (Definition 7) measures the Lp-norm difference between a model's posterior and the true likelihood of being correct.

Definition 7 (Lp Calibration Error). [231, 463] The Lp calibration error of f : X → ∆^Y over the joint distribution (X × Y), with the Lp norm p ∈ [1, ∞), is given by:

    CE_p(f)^p = E_{(X,Y)} [ ‖ E[Y | f(X)] − f(X) ‖_p^p ]    (2.12)

The popular ECE metric [332] with condition I[Y = ŷ] is a special case of the above with p = 1, where the expectation is approximated using a histogram. MaxCE defines the worst-case risk version with p = ∞, effectively reporting on the bin with the highest error. As part of Chapter 5, we contributed a novel empirical estimator of top-1 calibration for the task of VQA, where the exact accuracy condition I[Y = ŷ] in ECE is replaced by I[ANLS(y, ŷ) > τ]. Prior work [329] used a similar strategy of thresholding continuous quality scores to be able to estimate ECE.

In practice, ECE is implemented as a histogram binning estimator that discretizes predicted probabilities into ranges of possible values for which the conditional expectation can be estimated. Concretely, the probability space is partitioned into B bins b_i with i ∈ {1, ..., B}, where for each bin b_i the gap between observed accuracy acc(b_i) and average bin confidence P̄(b_i) is measured, with the final score a weighted average by the number of samples per bin |b_i|:

    ECE = Σ_{i=1}^{B} (|b_i| / N) · | acc(b_i) − P̄(b_i) |    (2.13)

To minimize the drawbacks inherited from histogram binning, as suggested by the literature [231, 342, 393, 463], we have applied an equal-mass binning scheme with 100 bins (close to √N). While plenty of histogram-based ECE estimator implementations exist, many design hyperparameters are not reported or exposed:

I. the ℓp norm;
II. the number of bins (beyond the unfounded default of |B| = 15);
III. the binning scheme (equal-range, equal-mass);
IV. the binning range defining the operating zone;
V. the proxy used for a bin's confidence (lower-edge, center, upper-edge).

We upstreamed a generic implementation of binning-based ECE (https://huggingface.co/spaces/jordyvl/ece) as part of the ICDAR 2023 DUDE competition (Chapter 5).
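A minimal estimator along the lines of Equation (2.13) is sketched below, exposing the binning-scheme choice from the list above; this is an illustrative simplification, not the exact upstreamed implementation.

    import numpy as np

    def ece(confidences, correct, n_bins=100, equal_mass=True, p=1):
        """Binned L_p calibration error; p=1 recovers ECE as in Eq. (2.13).
        confidences: CSF outputs in [0, 1]; correct: per-sample 0/1 correctness."""
        confidences = np.asarray(confidences, float)
        correct = np.asarray(correct, float)
        if equal_mass:  # equal-mass: bin edges at confidence quantiles
            edges = np.quantile(confidences, np.linspace(0, 1, n_bins + 1))
        else:           # equal-range: uniform bins over [0, 1]
            edges = np.linspace(0, 1, n_bins + 1)
        idx = np.clip(np.searchsorted(edges, confidences, side="right") - 1,
                      0, n_bins - 1)
        total, err = len(confidences), 0.0
        for b in range(n_bins):
            mask = idx == b
            if not mask.any():
                continue  # empty bins contribute nothing
            gap = abs(correct[mask].mean() - confidences[mask].mean())
            err += (mask.sum() / total) * gap ** p
        return err ** (1 / p)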
Alternative formulations have been developed for multi-class [342, 370, 492] and multi-label calibration [493, 520]. Measurements of "strong" calibration, over the full predicted vector instead of only the winning class, are reported less often in practice. Possible reasons are that they render class-wise scorings, either based on adaptive thresholds, or require estimation of a kernel-based calibration error to derive hypothesis tests. While we are mindful of alternatives (revisited in Section 2.2.4), we have found that the simpler "weak" calibration measured by ECE meets the practical requirements for most of our benchmarking.

Area-Under-Risk-Coverage-Curve (AURC) [138, 193] measures the possible trade-offs between coverage (the proportion of the test set accepted) and risk (the error rate under a given coverage). The metric explicitly assesses i.i.d. failure detection performance, as desired for safe deployment. It has advantages as a primary evaluation metric given that it is effective both when underlying prediction models are the same and when they are different (as opposed to AUROC or AUPR). Its most general form (without any curve approximation), with a task-specific evaluation metric ℓ and CSF g, is defined as:

    AURC(f, g) = E_{x∼P_X} [ E_{(x̃,ỹ)∼P_{XY}}[ ℓ(f(x̃), ỹ) · I[g(x̃) > g(x)] ] / E_{x̃∼P_X}[ I[g(x̃) > g(x)] ] ]    (2.14)

This captures the intuition that the CSF g should be able to rank instances by their risk, and that the risk should be low for instances with high confidence.

The standard curve metric can be obtained by sorting all CSF estimates from high to low and evaluating, for each threshold t, the risk FP/(TP+FP) and the coverage (TP+FP)/(TP+FP+FN+TN), where an instance counts as positive (P) if its score is above the threshold, together with its respective correctness (T if correct); a sketch follows below. Correctness is normally based on exact match, yet for generative evaluation in Section 5.3.5 we have applied ANLS thresholding instead. Formulated this way, the best possible AURC is constrained by the model's test error (1-ANLS) and the number of test instances. AURC might be more sensible for evaluating in a high-accuracy regime (e.g., 95% accuracy), where risk can be better controlled and error tolerance is an a priori system-level decision [115]. This metric was used in every chapter of Part II.

For the evaluation under distribution shift in Chapter 3, we have used binary classification metrics following [172]: Area Under the Receiver Operating Characteristic Curve (AUROC) and Area Under the Precision-Recall Curve (AUPR), which are threshold-independent measures that summarize detection statistics of positive (out-of-distribution) versus negative (in-distribution) instances. In this setting, AUROC corresponds to the probability that a randomly chosen out-of-distribution sample is assigned a higher confidence score than a randomly chosen in-distribution sample. AUPR is more informative under class imbalance.
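The following sketch computes the empirical risk-coverage curve and its area from per-instance CSF scores and correctness (exact match or thresholded ANLS); names are illustrative.

    import numpy as np

    def risk_coverage_curve(confidence, correct):
        """Sort by decreasing CSF score and sweep the acceptance threshold,
        tracking coverage (fraction accepted) and selective risk
        (error rate on the accepted subset)."""
        order = np.argsort(-np.asarray(confidence))
        errors = 1.0 - np.asarray(correct, float)[order]
        n = len(errors)
        coverage = np.arange(1, n + 1) / n
        risk = np.cumsum(errors) / np.arange(1, n + 1)
        return coverage, risk

    def aurc(confidence, correct):
        """Area under the empirical risk-coverage curve (lower is better)."""
        coverage, risk = risk_coverage_curve(confidence, correct)
        # coverage is a uniform grid, so the mean risk equals the area
        return float(np.mean(risk))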
2.2.4 Calibration

The study of calibration originated in the meteorology and statistics literature, primarily in the context of proper loss functions [330] for evaluating probabilistic forecasts. Calibration promises i) interpretability, ii) system integration, iii) active learning, and iv) improved accuracy. A calibrated model, as defined in Definition 8, can be interpreted as a probabilistic model, can be integrated into a larger system, and can guide active learning with potentially fewer samples. Research into calibration regained popularity after repeated empirical observations of overconfidence in DNNs [156, 339].

Definition 8 (Perfect calibration). [86, 88, 520] Calibration is a property of an empirical predictor f, which states that on finite-sample data it converges to a solution where the confidence scoring function reflects the probability ρ of being correct. Perfect calibration, CE(f) = 0, is satisfied iff:

    P(Y = Ŷ | f(X) = ρ) = ρ,  ∀ρ ∈ [0, 1]    (2.15)

Below, we characterize calibration research in two directions:

(A) CSF evaluation with both theoretical guarantees and practical estimation methodologies:
• estimators for calibration notions beyond top-1 [229, 231, 342, 463];
• theoretical frameworks to generalize over existing metrics and design novel metrics [43, 231, 492, 493];
• specialization towards a task such as multi-class classification [463], regression [228, 428], or structured prediction [227];
• alternative error estimation procedures, based on histogram regression [156, 331, 332, 340, 343], kernels [230, 370, 492, 493], or splines [159].

(B) Calibration methods for improving the reliability of a model by adapting the CSF or inducing calibration during training of f:
• learn a post-hoc forecaster F : f(X) → [0, 1] on top of f (overview: [298]);
• modify the training procedure with regularization (overview: [277, 370]).

Due to its importance in practice, we provide more detail on train-time calibration methods. It has been shown for a broad class of loss functions that risk minimization leads to Fisher-consistent, Bayes-optimal classifiers in the asymptotic limit [25, 495]. These can be shown to decompose into a sum of multiple metrics, including both accuracy and calibration error [144, 177]. However, there is no guarantee –neither on finite data nor asymptotically– that classifiers trained with proper loss functions containing an explicit calibration term will eventually be well-calibrated. In practice, being entangled with other optimization terms often leads to sub-optimal calibration. For this reason, recent studies [12, 230, 492] have derived trainable estimators of calibration to have a better handle (γ > 0) on penalizing miscalibration, i.e., by jointly optimizing the risk R(f) = E_{X,Y}[ℓ(Y, f(X))] and a parameterized calibration error (CE) as in Equation (2.16):

    f̂ = argmin_{f∈F} ( R(f) + γ · CE(f) )    (2.16)

Many of these methods implicitly or explicitly maximize the entropy of predictions, or entropy relative to another probability distribution, e.g., Entropy Regularization [361], Label Smoothing (LS) [327], Focal Loss [324], and Margin-based LS [277], next to more direct (differentiable), kernel-based calibration error estimation [211, 230, 370, 492, 493, 526]. We had expected community contributions to the DUDE competition (Chapter 5) to take advantage of this wealth of calibration methods, yet the majority of submissions used uncalibrated models with MSP, suggesting that more education on the importance of calibration in practice is needed.
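As an illustration of Equation (2.16), the sketch below combines cross-entropy risk with a confidence penalty [361] as a simple differentiable stand-in for a trainable calibration-error estimator; the value of γ and the choice of penalty are assumptions made for the example, not a prescription.

    import numpy as np

    def softmax(z):
        z = z - z.max(axis=-1, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=-1, keepdims=True)

    def risk_plus_calibration_loss(logits, labels, gamma=0.1):
        """Joint objective in the spirit of Eq. (2.16): cross-entropy risk R(f)
        plus a gamma-weighted miscalibration surrogate. Here the surrogate is a
        confidence penalty (negative predictive entropy), one of the
        entropy-maximizing regularizers discussed above [361]."""
        p = softmax(logits)
        n = len(labels)
        nll = -np.log(p[np.arange(n), labels] + 1e-12).mean()      # R(f)
        neg_entropy = (p * np.log(p + 1e-12)).sum(axis=-1).mean()  # -H(f(x))
        return nll + gamma * neg_entropy

In a real training loop the same composition would be expressed in an autodiff framework; kernel-based CE estimators [230, 492] slot into the same γ-weighted term.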
For the sake of completeness, there exist different notions of calibration, differing in the subset of predictions considered over ∆Y [463]:

I. top-1 [156]
II. top-r [159]
III. canonical calibration [51]

Formally, a classifier f is said to be canonically calibrated iff

    P(Y = y_k | f(X) = ρ) = ρ_k,  ∀k ∈ [K] ∧ ∀ρ ∈ [0, 1]^K, where K = |Y|.    (2.17)

However, this most strict notion of calibration becomes infeasible to compute once the output space cardinality exceeds a certain size [157]. For discrete target spaces with a large number of classes, there is plenty of interest in knowing that a model is calibrated on less likely predictions as well. Some relaxed notions of calibration have been proposed, which are more feasible to compute and can be used to compare models on a more equal footing. These include: top-label [157], top-r [159], within-top-r [159], and marginal [229, 231, 342, 492].

2.2.5 Predictive Uncertainty Quantification

Bayes' theorem [26] is a fundamental result in probability theory, which provides a principled way to update beliefs about an event given new evidence. Bayesian Deep Learning (BDL) methods build on these solid mathematical foundations and promise reliable predictive uncertainty quantification (PUQ) [124, 136, 140, 238, 301, 325, 326, 464, 466, 496].

The Bayesian approach consists of casting learning and prediction as an inference task about hypotheses (uncertain quantities, with θ representing all BNN parameters: weights w, biases b, and model structure) from training data (measurable quantities, D = {(x_i, y_i)}_{i=1}^N = (X, Y)).

Bayesian Neural Networks (BNN) are in theory able to avoid the pitfalls of stochastic non-convex optimization of non-linear tunable functions with many high-dimensional parameters [300]. More specifically, BNNs can capture the uncertainty in the NN parameters by learning a distribution over them, rather than a single point estimate. This offers advantages in terms of data efficiency, avoiding overfitting thanks to regularization from parameter priors, model complexity control, and robustness to noise due to the probabilistic nature. However, BNNs come with their own challenges, such as the increased computational cost of learning and inference, the difficulty of specifying appropriate weight or function priors, and the need for specialized training algorithms or architectural extensions.

For a fixed model m, the analytically intractable Bayesian posterior distribution of the parameters θ is given by Bayes' rule:

    P(θ | D) = P(D | θ) · P(θ | m) / P(D | m)    (2.18)

where P(D | θ) is the likelihood of θ (in model m), P(θ | m) the prior probability of θ, and P(θ | D) the posterior of θ given data D.

The denominator P(D | m) is intractable, since it requires integrating over all possible parameter values weighted by their probabilities. This is known as the inference problem, which is the main challenge in BDL, as the posterior distribution is required to compute the predictive distribution for any new input (Equation (3.1) further explains this).

In practice, BNNs are often implemented with Variational Inference (VI) methods, which approximate the high-dimensional posterior distribution with a tractable distribution family, such as a Gaussian distribution [46].
Let p(θ | D) be the intractable posterior distribution of parameters θ given observed data D, which will be approximated with a simpler, tractable distribution q(θ|D; φ), parameterized by φ (e.g., mean and variance). The key idea consists of finding the optimal variational parameters φ* that minimize the Kullback–Leibler (KL) divergence between the approximating distribution q(θ|D; φ) and the true posterior p(θ | D) it replaces. This is achieved by maximizing the evidence lower bound (ELBO), given by:

    ELBO(φ) = E_{q(θ|D;φ)}[log p(D|θ)] − KL[q(θ|D;φ) ‖ p(θ)]    (2.19)
            = ∫ q(θ|D;φ) log ( p(D|θ) p(θ) / q(θ|D;φ) ) dθ    (2.20)
            = ∫ q(θ|D;φ) log p(D|θ) dθ − ∫ q(θ|D;φ) log ( q(θ|D;φ) / p(θ) ) dθ,    (2.21)

where the first term in Equation (2.21) represents the expected likelihood of the data given the parameters, and the second term quantifies the dissimilarity between the variational distribution and the prior distribution over the parameters. Maximizing the ELBO over φ is equivalent to minimizing the KL divergence between q(θ|D; φ) and p(θ|D), thereby providing a lower bound on the log marginal likelihood, log p(D) ≥ ELBO(φ), after the parameters θ have been integrated out. By optimizing the variational parameters φ, we simultaneously fit the model to the data well and encourage the approximate posterior to be as close as possible to the true posterior distribution.

Even a non-Bayesian, classic NN can be interpreted in this framework as an approximate, degenerate posterior distribution, i.e., a Dirac delta function centered on the MAP estimate of the parameters, q(θ|D; φ) = δ(θ − θ̂_MAP). More PUQ methods based on different posterior approximations are discussed in detail in Chapter 3, with additional updates on the state-of-the-art.
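A minimal Monte-Carlo sketch of Equation (2.19) for a mean-field Gaussian approximation with the reparameterization trick follows; log_lik is an assumed user-supplied function returning log p(D|θ) for a flat parameter vector θ, and a N(0, prior_sigma²) prior is assumed.

    import numpy as np

    rng = np.random.default_rng(0)

    def elbo_estimate(log_lik, mu, log_sigma, n_samples=8, prior_sigma=1.0):
        """Monte-Carlo ELBO for q(θ; φ) = N(mu, diag(sigma²)), φ = (mu, log_sigma)."""
        sigma = np.exp(log_sigma)
        expected_ll = 0.0
        for _ in range(n_samples):
            eps = rng.standard_normal(mu.shape)
            theta = mu + sigma * eps          # reparameterization trick
            expected_ll += log_lik(theta) / n_samples
        # analytic KL between diagonal Gaussians: KL[q(θ;φ) || N(0, prior_sigma² I)]
        kl = 0.5 * np.sum(
            (sigma**2 + mu**2) / prior_sigma**2
            - 1.0
            - 2.0 * log_sigma
            + 2.0 * np.log(prior_sigma)
        )
        return expected_ll - kl

Gradient-based VI would maximize this quantity with respect to (mu, log_sigma) via automatic differentiation; the sketch only shows how the two ELBO terms are estimated.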
2.2.6 Failure Prediction

Based on the principle of selective prediction [138, 139], failure prediction is the task of predicting whether a model will fail on a given input. In every chapter following Chapter 3, this topic is addressed in the context of the respective task. Since it is an important topic in the context of IA-DU that is generating increasing interest [81, 114, 127, 193, 391], it warrants a brief overview of how it provides a unified perspective. We refer the reader to [171, 536] for a comprehensive survey.

Failure prediction subsumes many related tasks in the sense that it requires a failure source to be defined to form a binary classification task. The failure source can be i.i.d. mispredictions, covariate shifts (e.g., input corruptions, concept drift, domain shift), a new class, domain, modality, task, or concept. The goal of failure prediction is to predict these failures before they occur, allowing for more reliable and robust ML systems.

First, note that calibration does not imply failure prediction, as a model calibrated w.r.t. i.i.d. data can still be overconfident on OOD inputs [549]. Example 2.2.1 sketches the independent requirements of calibration and confidence ranking.

Example 2.2.1. Classifier A scores 90% accuracy on the test set, with a CSF using the entire range [0, 1]. Classifier B scores 92% accuracy on the test set, but the CSF always reports 0.92 for any input. Which classifier is preferred in a real-world setting?
• Classifier B is calibrated (its constant confidence of 0.92 matches its accuracy), but it is not possible to know whether it will fail on a given input.
• Classifier A might be less calibrated, but its CSF provides the separability needed to predict failure on a given input.

Specific to OOD failure prediction, [527] provides a comprehensive categorization of failure tasks and methods.
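Since failure prediction forms a binary classification task, the quality of a CSF can be summarized with a ranking metric such as AUROC over failures versus successes; a small sketch (names ours) follows.

    import numpy as np

    def failure_auroc(confidence, correct):
        """AUROC of a CSF for separating correct predictions from failures;
        equals the probability that a randomly chosen success receives a
        higher confidence than a randomly chosen failure."""
        confidence = np.asarray(confidence, float)
        correct = np.asarray(correct, bool)
        fail, ok = confidence[~correct], confidence[correct]
        # Mann-Whitney U statistic, counting ties as 1/2
        greater = (ok[:, None] > fail[None, :]).sum()
        ties = (ok[:, None] == fail[None, :]).sum()
        return (greater + 0.5 * ties) / (len(ok) * len(fail))

On Example 2.2.1, classifier B's constant CSF yields only ties (AUROC 0.5, no separability), while classifier A's full-range CSF can approach 1.0 despite being less calibrated.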
2.3 Document Understanding

This section focuses on the history and definition of DU as a field of AI. Like all subfields of AI, DU has been evolving rapidly, and the definition of a document has been changing accordingly. We identify three main stages in the evolution of the field, dependent on a) the type of learning, b) the unit of study, and c) the modality of the input. Regarding a), the field has followed the natural evolution from rule-based systems, to learning-based systems, to deep learning systems that build representations of documents. Regarding b), the field has evolved from region-based analysis, to page-level analysis, and is now moving to document-level analysis, as we have advocated in our research (Chapters 4 and 5). Regarding c), the field was originally dominated by OCR, particularly CV, then by KIE, emphasizing NLP, and now by both CV and NLP, with more attention given to multimodality and generative models by which new tasks can be approached, e.g., DocEdit [311].

Below, we expound on the evolution of the field through the lens of each modality, and the tasks that are typically associated with it. We also provide an overview of the most popular datasets and models in each task/modality.

The term Document Understanding (DU) is used in a variety of contexts (historical, research, commercial), and its definition deserves some attention. A seminal reference [430] dates back to 1992, which defines DU as 'the study of all processes involved in taking a document through various representations': from a physical object to a digital image, from an image to a symbolic description, and from a symbolic description to a high-level semantic representation. At the time, the field was dominated by Optical Character Recognition (OCR), particularly CV, and the definition was focused on the physical-to-digital conversion of documents, excluding born-digital documents.

Furthermore, the subterm document is used in the context of NLP (in particular in summarization) to denote a textually-rich document: a sequence of words exceeding a sentence or paragraph, or a single unit in a corpus. However, in DU it denotes a visually-rich document (VRD), which can be a combination of text, images, tables, and other elements. There is no universally established definition of a document [53], and it is used interchangeably with the term page, which is a physical, symbolic unit. In Chapter 4, we come back to this definition, addressing the misalignment of research with how documents occur in practice.

Over time, the quality of OCR has improved, and the focus of the field has shifted from OCR to document image classification (DIC) and key information extraction (KIE), which are more application-directed recognition tasks.

[Figure 2.4 shows an example invoice annotated with the outputs of common DU tasks: OCR (the raw transcribed text), DIC (document type: invoice), KIE (document number: 29069; document date: 12/21/2020), DocVQA (How much should be paid? $459.90), and DOD (bounding boxes for the logo and handwriting).]
Figure 2.4. A simple illustration of common DU tasks on an example document.

Arguably, most businesses are interested in the unstructured information contained in documents, rather than the documents themselves. On the commercial side, the combination of these tasks is often referred to as Intelligent Document Processing (IDP), albeit 'understanding' has been similarly marketed by, e.g., UIPath (originally an RPA company, now looking at AI as the next frontier of automation). The scientific community has been more careful in using the denomination 'understanding' [29], with the DUE benchmark [47] defining it, on the one hand, as an end-to-end process involving a subset of human cognitive skills, and on the other hand, enumeratively with several well-defined problems (OCR, KIE, VQA as defined in Section 2.3.1).

In our research, we have extended DU to denote 'the ability to holistically consume textual and visual elements structured according to rich semantic layouts, and reason over compositional information extracted from a VRD to generate meaningful insights or actions'. There is no specific notion of tasks, but rather an emphasis on the end-to-end process leveraging all modalities intrinsic to documents, where a generic DU model is expected to generalize to any task on any document from any domain. This stands in stark contrast to only DIC and KIE, where local context generalization (key-value pairs) is rewarded, whereas DU as defined here aims to generalize beyond the local context of a document.
2.3.1 Task Definitions

For a thorough understanding, each task will be defined in terms of the following components: input, output, model, and evaluation. Most tasks use a single document page as input (for both legacy and computational reasons), and the output depends on the task.

Formally, a page p consists of an image v ∈ R^{C×H×W} (number of channels, height, and width, respectively) with T word tokens u = {w_t}_{t=1}^T, where w_t maps to (sub)words in a vocabulary V, organized according to a layout structure s = {(x_t^1, y_t^1, x_t^2, y_t^2)}_{t=1}^T, typically referred to as token bounding boxes (top-left to bottom-right corner), coming from OCR or available from a born-digital document. Standardized notation for document inputs beyond a single page has been established in Chapter 4 [470].
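A minimal sketch of how this page representation could be typed in code follows; field names are illustrative and not taken from any specific library.

    from dataclasses import dataclass

    import numpy as np

    @dataclass
    class Page:
        """Single-page input following the notation above."""
        image: np.ndarray                       # v, shape (C, H, W)
        tokens: list[str]                       # u = (w_1, ..., w_T), (sub)words from OCR
        boxes: list[tuple[int, int, int, int]]  # s, per-token (x1, y1, x2, y2)

        def __post_init__(self):
            assert len(self.tokens) == len(self.boxes), "one bounding box per token"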
Optical Character Recognition (OCR) is the task of converting a document image to a sequence of characters. The input is a document image, and the output is a sequence of characters. The output space Y is the set of all possible characters (e.g., a, b, c, ..., A, B, C, ...), typically restricted to a subset of characters based on the document language and orthography. The quality is evaluated with a metric such as the word error rate (WER) or the character error rate (CER).

Document Classification (DC) is the task of assigning a document to a predefined class. The input is a document image, and the output is a class label. The output space Y is the set of all document classes (e.g., invoice, email, form, advertisement). Standard metrics are accuracy and the F1 score (under class imbalance).

Key Information Extraction (KIE) is the task of extracting key information from a document. The input is a document image, and the output is a set of key-value pairs. The output space Y is the set of all key-value pairs (e.g., date: 2024-01-01, total: 1000.00, ...), where keys are pre-defined as part of a format relevant to the document class in scope. In practice, it is implemented as sequence labeling with y = {y_1, y_2, ..., y_T}, where y_t ∈ Y is a label from an IOB/IOBES-encoded label set Y (B-DATE, I-DATE, ..., O). Extraction quality is evaluated with the sequence F1 score to account for the imbalance with the 'O' token.

Document Visual Question Answering (DocVQA) is the task of answering a question about a document. The input is a document image and a question, and the output is an answer. Depending on the type of question, the output space changes. Extractive questions (ExQA) require a subspan of the document's text as answer, y = (y_start, y_end) with y_start ≤ y_end and y_start, y_end ∈ {1, ..., T}. Abstractive questions (AbsQA) require a sequence of tokens as answer, y = {y_1, y_2, ..., y_T'} with y_t ∈ V. The latter is more complex to evaluate, yet more interesting for testing 'understanding' than restricting evaluation to answer spans, which is why we introduced AbsQA as part of Chapter 5. Orthogonal to the previous two types, DUDE introduces list questions with multiple or multi-span (ExQA) answers. Predicted answers are evaluated using ANLS, with multiple extensions defined in Section 2.2.3.

Document Layout Analysis (DLA) is the task of analyzing the layout of a document in terms of logical layout elements (e.g., text blocks, headers, figures, plots, tables). The input is a document image, and the output is a set of bounding boxes and their respective labels. The output space Y is the set of all possible bounding boxes and labels. More formally, it outputs a set of tuples, where each tuple (b_j, c_j) represents one of J detected logical layout elements. For each, b_j denotes the bounding box for the j-th detected element, defined as (x_j, y_j, w_j, h_j) (in the popular COCO format), and c_j is the class label for the j-th element, indicating its object category. Evaluation is done with the standard COCO metrics, i.e., average precision (AP) over different intersection-over-union (IoU) thresholds, and mean AP (mAP).

Document Generation (DG) is the task of generating a document from a set of key-value pairs and potential metadata attributes, e.g., visual appearance or color scheme. The output space Y is the set of all possible document images, which makes it hard to evaluate in a quantitative manner. Some efforts have been made to define metrics for document generation, e.g., Document Earth Mover's Distance [169], but they are not yet widely adopted.

Other lesser-known tasks include document object detection (DOD), table structure recognition (TSR), document retrieval, document editing, document translation, document summarization, and document authenticity verification. With the rise of multimodal models, more data types are being considered jointly with documents under the umbrella term visually-situated language, such as charts, tables, handwriting, text-heavy scenes or illustrations, webpage and user interface screenshots, etc.

2.3.2 Datasets

With the variety of tasks, there is a large number of datasets available for each DU task. Instead of exhaustively enumerating datasets for each task defined above, we will link to the tables in the respective chapters treating these tasks. We only highlight some more recent datasets, which are not yet included in the tables.

An overview of document source datasets for pretraining or dataset construction is presented in Table 4.1 as part of Chapter 4. For an overview of DC datasets, see Table 4.2 in the same chapter. For an overview of KIE datasets, we refer to [47], with some newer datasets [422, 485] linked here. An overview of DocVQA datasets is presented in Table 5.1, with the introduction of the DUDE dataset (Chapter 5). An interesting new addition is PDFTriage [400], which focuses more on retrieval than on QA. Finally, some datasets for DLA are presented in Table 6.1 as part of Chapter 6. Other essential datasets are PubLayNet [544] and DocBank [261], and the novel multidomain M6 dataset [71].
2.3.3 Models

A model taxonomy is presented in [407] that differentiates models based on the input modalities they use, the geometric approach, dependence on OCR, or the type of output they produce. However, it is far from comprehensive, due to missing out on various DU tasks and more recent models. Table 2.2 presents an overview of models that we have applied to various DU tasks, extending the taxonomy with our observations.

Depending on the modalities considered and the requirements of the task, different pretrained models have been used in practice, instead of the document foundation models presented above. For document text, the most popular models are BERT [95], RoBERTa [287], and T5 [383]. Additionally, text-only LLMs such as GPT-3 [52], Llama [452], and Mistral [199] are increasingly applied to document text. For document images, the most popular models are ResNet [167], EfficientNet [439], and DiT [259]. For all modalities combined, the most popular models are the LayoutLM series [187, 502, 503], DocFormer(v2) [15, 16], and UDOP [443]. The former are OCR-based pipelines, with pixel-only models such as Donut [216] and Pix2Struct [247] gaining popularity for increased efficiency, albeit they are still catching up on performance. Alternative approaches include the use of graph neural networks [286, 341, 517] and grid-based models [212, 275], yet their performance lags behind the aforementioned sequence models.

Most of the above-mentioned models have been applied during the Chapter 5 benchmark experiments, with only results missing for multimodal LLMs, which were introduced after the publications of the chapter. An up-to-date overview of newer multimodal LLMs, e.g., GIT2, PaLi, Flamingo, Kosmos-2, GPT-4, Fuyu, Llava, CogVLM, that could potentially be applied to DU tasks is presented in [512].

Model | Year | Conf. | Arch. | Input Mod. | Vision Branch
LayoutLMv1 [502] | 2020 | KDD | E | T+S | -
DocStruct [484] | 2020 | EMNLP | E | T+V+S | Resnet50
StrucText [266] | 2021 | ACM | E | T+V+S | Resnet50 + FPN
StructuralLM [254] | 2021 | ACL | E | T+S | -
LayoutLMv2 [503] | 2021 | ACL | E | T+V+S | ResNeXt 101
SelfDoc [263] | 2021 | CVPR | E | T+V+S | F-RCNN
LamBERT [134] | 2021 | ICDAR | E | T+S | -
TILT [371] | 2021 | ICDAR | E+D | T+V+S | U-Net
DocFormerv1 [15] | 2021 | ICCV | E | T+V+S | Resnet50
UniDoc [153] | 2021 | NeurIPS | E | T+V+S | Resnet50
DiT [259] | 2022 | ACM | E | V | ViT
LayoutLMv3 [187] | 2022 | ACM | E | T+V+S | Linear
BROS [181] | 2022 | AAAI | E | T+S | -
XYLayoutLM [154] | 2022 | CVPR | E | T+V+S | ResNeXt 101
FormNet [245] | 2022 | ACL | E | T+S | -
ERNIE-Layout [264] | 2022 | EMNLP | E | T+V+S | F-RCNN
LiLT [481] | 2022 | ACL | E | T+S | -
XDoc [66] | 2022 | EMNLP | E | T | -
GeoLayoutLM [296] | 2023 | CVPR | E | T+V+S | F-RCNN+ConvNeXt
Vision Grid Transformer [80] | 2023 | ICCV | E | T+V+S | ViT
DocFormerv2 [16] | 2023 | - | E+D | T+V+S | Linear
Donut [216] | 2022 | ECCV | E+D | V | SwinTransformer
Pix2Struct [247] | 2023 | ICML | E+D | V | ViT+variable res
UDOP [443] | 2023 | CVPR | E+D | T+V+S | ResNeXt 101
Hi-VT5 [451] | 2023 | PatRecog | E+D | T+V+S | ViT
FormNetv2 [246] | 2023 | ACL | E | T+V+S | 3-layer CNN
LayoutMask [458] | 2023 | ACL | E | T+S | -
UReader [510] | 2023 | ACL | D | V+S | CLIP-ViT
DocLLM [480] | 2024 | - | D | T+S | -
Gramformer [44] | 2024 | - | E+D | T+V+S | Linear
InstructDoc [442] | 2024 | - | E+D | T+V+S | CLIP-ViT

Table 2.2. Adapted from [16]. A summary of DU prior art with their architecture (E: Encoder, D: Decoder), the input modalities (T: text, V: vision, S: spatial features), and the vision feature branch. Missing entries are marked '-'.
2.3.4 Challenges in Document Understanding

To tease the contributions of our works, we highlight some of the most important challenges in DU, which are shared by all chapters in this thesis.

2.3.4.1 Long-Context Modeling

An important challenge for most SOTA DU models based on the Transformer architecture is long document processing, which is not yet solved satisfactorily and is the focus of Chapters 4 and 5. We illustrate the extent of the problem with the most popular DU model, LayoutLMv3 [187] (over 8.6M model weight downloads in January 2024), in Figure 2.5, pointing to the quadratic complexity of attention, which cannot be parallelized over pages with encoder-only models. Hi-VT5 [451] is the only model that is by design usable for multipage documents, yet it requires a lot of memory and depends on compressing page information into learnable embeddings.

[Figure 2.5 diagrams the LayoutLMv3 encoder: token, 1D/2D position, layout, and patch embeddings are concatenated into one sequence, so each of the S Transformer blocks computes multi-head attention over (T+M)² token pairs.]
Figure 2.5. Inefficiency of document foundation models for processing multipage documents, illustrated with LayoutLMv3 [187]. Notation: L pages, T text tokens, M linearized visual patches, S Transformer layers.

While a page is the modeling unit of preference for maintaining computational efficiency when Transformers process sequences of tokens, it is not the natural appearance of a document. Some tasks require the global document context, and treating each page as contextually independent is suboptimal, as argued in our works on multipage document classification (Chapter 4) and DocVQA (Chapter 5) with multi-hop question answering.
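A back-of-the-envelope sketch of the quadratic blow-up follows; the per-page token and patch counts are assumptions chosen for illustration.

    # Rough size of the (T+M)^2 self-attention matrix when concatenating
    # L pages into one encoder sequence, versus encoding pages independently.
    def attention_entries(pages: int, text_tokens: int, patches: int) -> int:
        seq = pages * (text_tokens + patches)   # joint multipage sequence
        return seq * seq

    tokens_per_page = 512    # T, assumed
    patches_per_page = 196   # M, assumed
    for L in (1, 5, 20):
        joint = attention_entries(L, tokens_per_page, patches_per_page)
        per_page = L * attention_entries(1, tokens_per_page, patches_per_page)
        print(f"L={L:2d} pages: joint {joint:.2e} vs page-wise {per_page:.2e}")

The joint sequence grows as L²·(T+M)², while page-wise encoding grows only linearly in L, which is exactly the trade-off hierarchical architectures try to exploit.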
Figure 2.6 illustrates how a prototypical multimodal architecture, Hi-VT5 [451], is used for the task of multipage extractive DocVQA.

[Figure 2.6 shows the Hi-VT5 pipeline: each page's question tokens Q, OCR tokens with bounding boxes, and visual features are processed by a shared document encoder; the compressed page representations are passed to an answer-type module (output: Extractive), a question-type module (output: Quantity), and an answer decoder, which produces '$8,834.17' for the question 'How much does Solardyne still owe GroSolar?'.]
Figure 2.6. Hi-VT5 architecture for multipage, extractive DocVQA.

In principle, every LLM can perform multipage document processing, depending on the ability of the LLM to extrapolate to longer context windows, given the position representation method (barring absolute positional encodings), with performance also relying on having trained on long sequences, e.g., by instruction-tuning on long-context data. Naturally, the computational cost will increase with the length of the input data, yet recently many advances have made subquadratic complexity feasible (e.g., relative positional encodings [382], ALiBi [374], FlashAttention [82], multi-query attention [9], etc.); a sketch of the ALiBi bias follows below. [102] provides an overview of the SOTA in long-range Transformers for DU tasks. A recent approach [44] proposes a hierarchical architecture to model both local page-level attention and global document-level attention on learnable document-level tokens, with an additional compression module to scale to 100+ pages while keeping latency low.
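As an example of such an extrapolation-friendly position method, the sketch below constructs the ALiBi [374] attention bias, which replaces learned position embeddings with a head-specific linear penalty on query-key distance; the slope schedule shown is the power-of-two-heads case from the paper.

    import numpy as np

    def alibi_bias(seq_len: int, n_heads: int) -> np.ndarray:
        """ALiBi bias added to attention logits before the softmax;
        the causal mask itself is applied separately."""
        # geometric head slopes 2^(-8/n), 2^(-16/n), ... as in ALiBi [374]
        slopes = 2.0 ** (-8.0 * np.arange(1, n_heads + 1) / n_heads)
        distance = np.arange(seq_len)[None, :] - np.arange(seq_len)[:, None]
        distance = np.minimum(distance, 0)  # only penalize looking back
        return slopes[:, None, None] * distance[None, :, :]  # (heads, q, k)

Because the penalty is defined for any distance, the same bias extends to sequences longer than those seen in training, which is what enables length extrapolation.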
2.3.4.2 Document Structure Modeling

Representing structured documents as plain text resulting from OCR is not congruent with how humans perceive documents [294], which is the focus of Chapter 6. Document layout is a valuable cue for navigating a document's structure and finding information more efficiently, but it is not always modeled properly, with most methods relying on geometric features (1D/2D absolute positional encodings) that are neither robust to OCR errors nor able to capture the semantic complexity of document layouts.

There are notable recent advances in layout modeling, e.g., modeling relative positions with polar coordinates and layout attention with Gaussian biases [555], and DocLLM [480], which ignores visual features to focus on disentangling the layout structure from the document text; these are promising directions for future research.

2.4 Intelligent Automation

Automation is the use of technology to perform tasks with reduced human assistance. Throughout history, humankind has experienced waves of automation, from the invention of the wheel to the steam engine, the assembly line, and the computer. Manual labor in particular, performed by blue-collar workers, has been increasingly automated since the 20th century. When automation is applied to knowledge work as performed by white-collar workers, more through the use of software than hardware, it is referred to as Intelligent Automation (IA, not to be confused with the French acronym of 'intelligence artificielle') [1].

IA is a rapidly growing field, with the market for hyperautomation-enabling technologies projected to have reached nearly $600 billion in 2022, a 24% increase from 2020 [392]. A recent survey [135] does show that IA adoption is lagging behind expectations, with only 19% of organizations having deployed their automation programs and 38% in the planning stage.

[48] identified five key trends in IA: 1) the rise of the digital workforce, 2) the emergence of the digital twin, 3) the importance of data, 4) the need for orchestration, and 5) the rise of the citizen developer. The first three trends are particularly relevant to the work presented in this thesis.

IA is a subset of Artificial Intelligence (AI) specifically designed for the automation of knowledge work. It encompasses several technologies, including Robotic Process Automation (RPA), which can be thought of as software to automate routine tasks, and Workflow & Business Process Management (BPM). When combined with people and organizations, these technologies are capable of solving major world problems [48].

The goal of IA is to create a software-based digital workforce by mimicking the four main human capabilities required to perform knowledge work: vision, language, thinking & learning, and execution. This allows for the construction of straight-through business processes, which are more efficient in terms of productivity, processing speed, and cost, and often more effective in terms of quality and logic. The ultimate aim is not to replace human workers, but to take the robot out of the human, augmenting human intelligence, creativity, and productivity.

IDP/DU is a prototypical example of an IA use case, as it frees workers from paperwork, allowing them to focus on more value-adding tasks, thereby providing a clear perspective on the future of work. Finally, we provide an overview of the requirements for setting up IA, linking back to the technical concepts introduced before.

Enabling IA requires well-defined CSFs and either operational thresholding to determine the trade-off between automation and risk, or a selective prediction setup (a sketch of operational thresholding closes this section). When a system is deployed in production, it also requires robustness to distribution shifts, both expected and unexpected, and the ability to detect and predict a wide variety of failures.

Measuring IA is performed using calibration metrics and confidence ranking metrics. Calibration is the degree to which a model's predicted probabilities match the true probabilities of the events it predicts. Confidence ranking is the degree to which a model's predicted probabilities are ordered in accordance with the true correctness of its predictions. If the i.i.d. assumption becomes violated, the model's confidence ranking will be affected, and the model may become overconfident on OOD inputs. As part of the deployment process, it is important to monitor the model's performance and to detect when it starts to fail, where other metrics are more appropriate.

Improving IA Improvements to IA can be made by inducing calibration through post-hoc strategies or by designing calibrated loss functions, as well as through predictive uncertainty estimation for model selection and for capturing issues with the data or model before deployment; all investments in failure prediction will be rewarded with more robust and reliable systems.
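To close the loop on operational thresholding, a small selective-prediction sketch follows: given a held-out set of CSF scores and correctness labels, it picks the lowest threshold whose accepted subset stays within an error budget. The risk budget and function names are illustrative assumptions, not a production recipe.

    import numpy as np

    def automation_threshold(confidence, correct, max_risk=0.05):
        """Return (threshold, coverage) maximizing automation while keeping
        the error rate on accepted instances within max_risk."""
        confidence = np.asarray(confidence, float)
        correct = np.asarray(correct, bool)
        order = np.argsort(-confidence)
        # running error rate over the largest-confidence prefixes
        err = np.cumsum(~correct[order]) / np.arange(1, len(order) + 1)
        ok = np.where(err <= max_risk)[0]
        if len(ok) == 0:
            return float("inf"), 0.0  # nothing can be automated at this risk
        k = ok[-1]  # largest accepted prefix within the budget
        return confidence[order][k], (k + 1) / len(order)

Instances scoring below the returned threshold would be routed to a human, which is precisely the automation/risk trade-off discussed above.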
Part I

Reliable and Robust Deep Learning

Chapter 3

Benchmarking Scalable Predictive Uncertainty in Text Classification

The contents of this chapter come from two publications [465, 466]:

Jordy Van Landeghem, Matthew B Blaschko, Bertrand Anckaert, and Marie-Francine Moens. Predictive Uncertainty for Probabilistic Novelty Detection in Text Classification. In ICML Workshop on Uncertainty and Robustness in Deep Learning, 2020.

Jordy Van Landeghem, Matthew Blaschko, Bertrand Anckaert, and Marie-Francine Moens. Benchmarking Scalable Predictive Uncertainty in Text Classification. IEEE Access, 2022.

The first publication started as a reproduction of [500] with a deeper focus on text classification, and the second publication is a large journal extension of the first.

This chapter focuses on how to quantify uncertainty in text classification tasks, which is a prerequisite for trusting a model's predictions in real-world applications such as intent classification in automated document processing based on the document text. We conduct a benchmarking study of uncertainty estimation methods applied to 6 real-world text classification datasets, including both multi-class and multi-label classification, with 1-D convolutional neural networks and pretrained transformers. The experiments empirically investigate why popular scalable uncertainty estimation strategies (Monte-Carlo Dropout, Deep Ensemble) and notable extensions (Heteroscedastic, Concrete Dropout) underestimate uncertainty, and how to improve their performance. We motivate that uncertainty estimation benefits from combining posterior approximation procedures, linking it to recent research on how ensembles and variational Bayesian methods navigate the loss landscape.

We find that our proposed method combination of Deep Ensemble with Concrete Dropout demonstrates superior performance, by analysis of in-domain calibration, cross-domain classification, and novel class robustness, even at a smaller ensemble size. Our results corroborate the importance of fine-tuning the dropout rate to the text classification task at hand, which individually and as an ensemble impacts model robustness. We observe in ablation that pretrained transformers severely underperform in novelty detection, limiting the applicability of transfer learning when distribution shift from novel classes can be expected.
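To make the two baseline strategies of this chapter concrete, the sketch below averages predictive distributions over stochastic forward passes (MC Dropout) and/or independently trained models (Deep Ensemble), and scores uncertainty with predictive entropy; model_fns is an assumed list of callables returning class probabilities.

    import numpy as np

    def mc_predictive(model_fns, x, n_samples=10):
        """Predictive distribution and entropy from Monte-Carlo sampling.
        For MC Dropout, model_fns holds one model with dropout active at test
        time; for a Deep Ensemble, it holds the ensemble members (and the two
        can be combined, as proposed in this chapter)."""
        draws = np.stack([f(x) for f in model_fns for _ in range(n_samples)])
        mean = draws.mean(axis=0)                          # predictive distribution
        entropy = -(mean * np.log(mean + 1e-12)).sum(-1)   # predictive uncertainty
        return mean, entropy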
Supporting context: As the publications were written at the start of my PhD, we take the opportunity here to give an update on the state of the art and the relevance of our work in uncertainty estimation research.

The journal extension was motivated as a survey and benchmark of scalable Bayesian Deep Learning methods, in which we introduced novel hybrid models and evaluated uncertainty estimation quality under distribution-shift configurations. We also provide a convenient entry point for practitioners, as our benchmarking software is available online (https://github.com/JordyVL/uncertainty-bench). Our work has also been re-used as the basis of a conference tutorial [524, https://sites.google.com/view/uq-tutorial].

In a similar spirit to our work, new benchmarks have put different aspects of reliability and robustness to the test: Shifts [306] focuses on the robustness of uncertainty methods to real distribution shifts in large-scale tasks across overlooked modalities such as tabular, audio, or sensor data; WILDS [220, 401] curates a collection of labeled and unlabeled datasets exhibiting distribution shifts in the wild; OpenOOD [527] generalizes a comprehensive benchmark for out-of-distribution detection, anomaly detection, and open-set recognition; and finally, PLEX [455] probes pretrained models on their ability to estimate uncertainty, exhibit robustness under shifts, and adapt in settings of active, few-shot, and life-long learning.

The supremacy of ensemble methods has been challenged by the recent publication of [346], which proposes a new method for uncertainty estimation in NNs, called EpiNet. The authors claim that their non-Bayesian method is able to discern the difference between ambiguity and lack of data. Key ingredients are a dyadic sampling procedure, which creates interesting data pairs that are used to train a NN to predict the epistemic uncertainty, and a small architecture that can supplement any conventional NN to improve OOD detection and active learning [413]. Another competitive method [326] concentrates on feature-space density estimation under the assumptions of smoothness and sensitivity, with their efficient baseline disentangling epistemic uncertainty (a Gaussian Mixture Model fit on training features, with a separate covariance matrix per class) and aleatoric uncertainty (the entropy of the softmax distribution). Other promising methods target aleatoric uncertainty, such as [75, 474], which focus on label noise or ambiguous tasks such as toxicity detection.

An important observation on the benefits of Bayesian NNs concerns the dataset and model size: Bayesian modeling particularly shines in dynamic settings where the size of the model/data is unknown or changes over time [346], e.g., online, continual, active, and life-long learning. In static settings with high accuracy on a fixed test set, the benefits of Bayesian modeling are less pronounced [215].

Next to PUQ, alternative approaches have sought to learn explicit scoring functions [200, 351] or to assess the similarity of inputs to the training distribution [54, 271, 285, 379, 487]. All of these efforts have recently increased in popularity, as uncertainty estimation has become even more important for the safe deployment of LLMs in user-facing applications [111].
+ +3.1 + +Introduction + +Reliable uncertainty quantification is indispensable for any machine learning +system trusted in decision-making in many application domains such as medical +diagnosis, self-driving cars and automated document processing. In any typical +industrial application, we desire predictive uncertainty to communicate on the +model’s lack of in-domain knowledge due to either training data scarcity or +model design errors, or its ability to flag potentially noisy, shifted or unknown +input data (see [136] for more detail on sources of uncertainty). +Supervised Deep Learning (DL) algorithms have been found to provide +“catastrophically overconfident predictions” [116] under data distribution shift. +Specifically, novel class distributions can emerge at inference time [367], which +desirably should be detectable in a model’s uncertainty. To this end, scalable +Bayesian DL (BDL) methods for uncertainty estimation have been recently +developed, generating increased interest from practitioners in need of practical +solutions. BDL comprises an increasingly large range of theoretically well- + + \ No newline at end of file diff --git a/assets/txts/pg_0079.txt b/assets/txts/pg_0079.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d8c790af013c6057c75b8345a20a41adbd01ff9 --- /dev/null +++ b/assets/txts/pg_0079.txt @@ -0,0 +1,44 @@ +INTRODUCTION + +47 + +motivated predictive uncertainty methods (PUQ), yet only some are able to +scale in network architecture and dataset size. Additionally, most surveys +and research output on predictive uncertainty is based on multi-class image +classification or regression experiments. We argue that predictive uncertainty +methods and how well they scale in Natural Language Processing (NLP), for +text classification tasks, is still an under-explored question. +The context of our study is a production-level text classification system for +automatically handling incoming communications in information-intensive +industries (e.g. legal, banking, insurance). Imagine a digital-first company +where each department has its own document classifier operating under a closed +world assumption. However, whenever a client mistakenly sends a document (car +purchase invoice requesting a loan) to the wrong department (say underwriting +or medical claims), this can generate high-confidence false positives that trigger +the wrong action (insurance or claim settlement instead of loan application). +Similarly, if an insurance broker suddenly decides to completely change the +document template that clients use to apply for a car loan, the production +model might not find previously salient features which it had learned to rely on +for accurate classification. This shows that detection of anomalous inputs and +shifting distributions is critical to keep errors in automation low. +We investigate different techniques and procedures for incorporating uncertainty +into DL models for text classification, analyzing the degree to which they can +reliably capture uncertainty under extrapolation (outside the support of the +training set), both individually and combined in an ensemble. Our findings for +individual predictive uncertainty methods are overall consistent with benchmarks +in other modalities, with Deep Ensemble reporting greater robustness than +approximate Bayesian methods. 
However, we discover from empirical findings +that our newly proposed combinations, particularly MC Concrete Dropout +Ensemble, can push the bounds by exploiting the in-domain calibration effect of +Concrete Dropout and all-round ensemble qualities for increased out-of-domain +and novel class robustness. +We intend our work to be used as a survey and benchmark of scalable BDL +methods, where the architectures and datasets are drawn from NLP, thereby +covering a void in the literature on uncertainty estimation in this field. Next to +proposing a well-motivated evaluation methodology, this chapter also provides +a convenient entry point for practitioners.1 +Our key contributions can be summarized as follows: +• We conduct a benchmarking study of established uncertainty estimation +1 Our benchmarking software [TensorFlow 2] is available at https://github.com/JordyVL/uncertainty-bench + + \ No newline at end of file diff --git a/assets/txts/pg_0080.txt b/assets/txts/pg_0080.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba6922f603a88fa4da9f7b0203994de76f6b2d99 --- /dev/null +++ b/assets/txts/pg_0080.txt @@ -0,0 +1,46 @@ +48 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +methods applied on real-world text classification datasets. Our analysis +focuses on model robustness and uncertainty quality in realistic data +distributions. We propose a practical methodology to test the above, +resulting in a better understanding of the individual shortcomings of +predictive uncertainty methods. +• We motivate and introduce novel combinations of predictive uncertainty +methods, providing empirical evidence for their complementary benefits. +Through statistical analyses and ablation experiments we discern the +importance of certain prior, model or hyperparameter influences on the +reliability of predictive uncertainty. +Organization The paper is organized as follows. Section 3.2 overviews +related work in uncertainty benchmarking, distribution shift, and uncertainty +estimation in NLP. We present core concepts of BDL in Section 3.3 to build +up a thorough understanding of predictive uncertainty in theory and practice. +We include this introductory text for readers less familiar with uncertainty +methods. Section 3.3.5 critically analyzes the practice of evaluating uncertainty +under distribution shift. Sections 3.3.4 and 3.4.1 stand central in our work, +connecting recent research on how neural networks navigate the loss landscape +with posterior approximation procedures, followed by our work’s hypotheses on +complementary benefits between predictive uncertainty methods. +Section 3.4 details our methodological setup from datasets, model architectures, +uncertainty estimation and evaluation, to experimental settings. We present +in Section 3.5 the results of 3 large benchmarking experiments, followed by +4 smaller ablation studies on important hyperparameters. After closing the +discussion in Section 3.6 with take-home messages targeting researchers and +practitioners interested in uncertainty prediction in text classification, Section 3.7 +details additional experiments, and Section 3.8 draws up some limitations of our +research. Finally, we synthesize our contributions in Section 3.9 and propose +directions for future work on uncertainty research in NLP. +The Appendices support the main text by detailing implementation (A), +practical considerations (compute, timings) (B), and detailed evaluation data +for full transparency (C). 
+ +3.2 + +Related Work + +In this Subsection, we overview recent literature on benchmarking the quality of +uncertainty quantification in DL and more specifically research on uncertainty +estimation for NLP tasks. + + \ No newline at end of file diff --git a/assets/txts/pg_0081.txt b/assets/txts/pg_0081.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb341f5319a7e28801cf477f8c01954fb53bf62c --- /dev/null +++ b/assets/txts/pg_0081.txt @@ -0,0 +1,47 @@ +RELATED WORK + +49 + +Increasingly, there are efforts from the research community to help BDL methods +scale to real-world scenarios [205]. Benchmarks are an important tool to +help researchers prioritize the right approaches and to inform practitioners +which methods are suited for their applications [276]. There is a growing +demand for benchmarking in BDL, since methods must be scored both for task +performance and uncertainty quality [411, 496]. Rigorously evaluating the latter +is considerably more difficult, since depending on the problem setting no direct +uncertainty ground-truth exists, requiring a well-defined experimental setup +[323]. +A standard benchmark in BDL is UCI [176], a set of curated regression datasets, +which allows to judge uncertainty quality with the predictive log-likelihood +metric. However, its general applicability and validity has been criticized on +multiple accounts [113, 323, 360]. +More recently, [19, 113, 301, 348, 462] presented large-scale evaluation studies +of BDL methods with benchmarking on real-world datasets. These studies +motivate data retention and distribution shift as generic protocols for evaluating +predictive uncertainty. Similarly, we argue that even mild shifts of data are +unavoidable in real-world applications and, conditional to specific distribution +shift assumptions (see Section 3.3.5), this provides a good testing ground for +uncertainty evaluation. +[348] consider two types of distribution shift: (a) out-of-distribution (OOD) data +from separate datasets, and (b) adversarial shift, where the test distribution +consists of perturbed or corrupted ground truth data isolated from training. +In our work we propose novel class detection as an alternative to a), which we +motivate to be a more representative experimental setup for testing uncertainty +in text classification (more detail in Subsections 3.3.5 and 3.4.5.3). [142] bring a +similar argument against b) that adversarial examples are often overly synthetic +and disconnected from real-world performance concerns, which we assert to be +especially true for perturbations applied to text data. Therefore, we derive a +challenging experimental setup for b) (more detail in Section 3.4.5.2) inspired +by the extensive literature in NLP on the problem of domain shifts and domain +adaptation [45, 84, 129, 203, 388, 557]. Domain adaptation approaches aim to +mitigate performance degradation that occurs when transferring a classifier from +a source domain to a target domain. Learning under domain shift presents a +complex challenge in text classification since linguistic patterns can be highly +different across domains, even harder to tackle when domains are unknown a +priori [388]. While out-of-domain generalization is the ultimate objective [18], +we believe that accurate uncertainty prediction has a major role to play in the +detection of out-of-domain data, which is currently under-explored. 
[488] is a +notable exception where predictive uncertainty methods are leveraged to learn +domain-invariant features in unsupervised fashion. + + \ No newline at end of file diff --git a/assets/txts/pg_0082.txt b/assets/txts/pg_0082.txt new file mode 100644 index 0000000000000000000000000000000000000000..69a6aeba724bd5f62c752e341020c0bd26ec88ae --- /dev/null +++ b/assets/txts/pg_0082.txt @@ -0,0 +1,47 @@ +50 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +In this work we only consider methods that directly estimate the predictive +posterior and aim at obtaining high quality uncertainty estimates by +discriminative models without any additional OOD components. However, +there exists a large number of alternative OOD detection and generalization +approaches. We surmise that these can be more effective in handling the +above distribution shifts, yet they have different modeling assumptions which +complicates a direct comparison, for instance, access to (auxiliary) OOD data +[271, 285], generative modeling [334], focus on abstention mechanisms [138], or +characterization of dataset shifts with a two-sample-testing approach [379]. We +recommend [54, 414] for an overview of these approaches. +While previous BDL benchmarks have helped standardize protocols, metrics and +analysis tools, the effort is not spent equally across all modality and problem +settings (as can be observed in the survey of [4]). Arguably, most research on +uncertainty estimation focuses on regression and image classification tasks as +they offer visual validation on uncertainty quality, e.g., [214]. +Tasks in the NLP field involve discrete natural language units (word, sentence, +paragraph) as input, which requires a translation to the continuous domain by +embedding discrete units to form high-dimensional distributed representations +[321]. This presents additional complexity compared to image or time-series +data which as continuous signals can be directly fed into a Neural Network +(NN). Furthermore, specialized algorithms (e.g., dealing with long sequences, +attention for larger memory [473]) and progressively more complex architectures +[27] are being created to tackle this unique challenge in NLP, which can affect +the performance of predictive uncertainty techniques. With our work, we +start the exploration into effects of field characteristics, notably different NLP +architectures, inherent task complexity, and properties of language in text +processing (e.g., ambiguity [397], document length [478], pre-defined vocabulary +[68]) that could cause problems when predicting uncertainty. More specifically, +we seek to answer how uncertainty research translates to a prototypical language +task such as text classification, which more frequently than vision tasks is +characterized by non-mutually exclusive labels [312], a problem setting ignored +by existing BDL benchmarks. +BDL research on NLP tasks is generally limited, certainly when considering +quantitative evaluation of predictive uncertainty quality. While we draw +inspiration from the uncertainty estimation methods of [500], their study focuses +on the performance increase of non-probabilistic measures (mean-squared error) +and only reports sentiment regression results. Moreover, we find no quantitative +evaluation of the quality of the uncertainty scores and comparison to simpler +measures of uncertainty, for instance, softmax score or predictive entropy. 
[174] does focus on the robustness of pretrained Transformers to distribution shift, yet without application of any predictive uncertainty methods. [322, 533] present
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0083.txt b/assets/txts/pg_0083.txt
new file mode 100644
index 0000000000000000000000000000000000000000..65f33eb738e4cd8897991e856b5f6c6be45d041c
--- /dev/null
+++ b/assets/txts/pg_0083.txt
@@ -0,0 +1,47 @@
+UNCERTAINTY METHODS
+
+51
+
+similar setups applying Monte Carlo Dropout to regular NLP architectures in an active learning setup, yet they only aim to increase overall predictive performance by relying on in-domain calibration. Our work benchmarks individual and joint predictive uncertainty methods in multiple text classification task settings over two well-motivated uncertainty evaluation setups, testing robustness to distribution shift for NLP problems.
+
+3.3
+
+Uncertainty Methods
+
+The first Subsection formally presents how to quantify uncertainty in BDL and how popular methods approach inference differently. Section 3.3.2 treats predictive uncertainty methods with a focus on the algorithmic procedure, followed by representative method extensions for more reliable uncertainty estimation. Section 3.3.3 describes from what sources uncertainty originates and how to quantify uncertainty at test time. In Section 3.3.4 we present the rationale of our study, connecting recent research on how NNs navigate the optimization landscape with the posterior approximation procedure of methods from Section 3.3.2. Section 3.3.5 provides a critical note on how distribution shift impacts uncertainty estimation and the evaluation thereof.
+
+3.3.1
+
+Quantifying Uncertainty in Deep Learning
+
+In modern Deep Learning, two common uncertainty (or inversely “confidence”) estimates are the maximum posterior class probability, known as softmax-score, and the predictive entropy over posterior class probabilities [415, 522]. However, [156]’s work on confidence calibration demonstrated these to be unreliable estimates of Neural Networks’ uncertainty. While post-hoc calibration methods such as Temperature or Vector Scaling [156, 419] can easily calibrate classifier uncertainty in-domain (further discussed in Section 3.3.5), they have been found to be less effective under increasing distribution shift [19, 348].
+Bayesian Deep Learning (BDL) methods build on solid mathematical foundations and hold promise for more reliable learned uncertainty estimates [496]. Drawing on the foundational works of [91, 179, 299, 300, 337], the “second-generation” in BDL [140] is geared towards finding practical and scalable approximations to the analytically intractable Bayesian posterior (Equation (3.1)). Inferring a prediction and the associated uncertainty for a new test input x^* (with its associated label vector y^*) requires computing the
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0084.txt b/assets/txts/pg_0084.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7d4a938482c516a356f411ab6fe2a2cc237fe25
--- /dev/null
+++ b/assets/txts/pg_0084.txt
@@ -0,0 +1,60 @@
+52
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+conditional probability of y^* given x^* and the training data D = {(x^{(n)}, y^{(n)})}_{n=1}^{N}:
+
+P(y^* | x^*, D) = \int P(y^* | x^*, D, \theta) \, \underbrace{P(\theta | D)}_{\text{posterior}} \, d\theta,    (3.1)
+
+with \theta representing all Bayesian Neural Network (BNN) parameters: weights w, biases b.
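+To make the two baseline estimates above concrete, before turning to scalable approximations of Equation (3.1): a minimal NumPy sketch of the softmax-score and predictive entropy, computed from a deterministic classifier's logits (function names and shapes are ours, not from the released benchmark code):
+
+    import numpy as np
+
+    def softmax(logits):
+        # subtract the row-wise max for numerical stability
+        z = logits - logits.max(axis=-1, keepdims=True)
+        e = np.exp(z)
+        return e / e.sum(axis=-1, keepdims=True)
+
+    def softmax_score(logits):
+        # maximum posterior class probability, shape (batch,)
+        return softmax(logits).max(axis=-1)
+
+    def predictive_entropy(logits, eps=1e-12):
+        # entropy over posterior class probabilities, shape (batch,)
+        p = softmax(logits)
+        return -(p * np.log(p + eps)).sum(axis=-1)
+
+Both are single-forward-pass quantities; the methods below instead approximate the posterior of Equation (3.1) by sampling.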
+In our study we will focus on two strategies with representative methods that circumvent the inference problem and have seen more widespread adoption given their ability to scale both in network architecture and dataset size.
+I. The weight-snapshots direction, Deep Ensemble [238], which aims to find different sets of model parameters. Snapshots can be collected during different stages of training [133, 186, 301], or by using a sampling process such as Markov Chain Monte Carlo (MCMC) [141, 180, 530].
+II. The stochastic computation-graph direction, Monte Carlo Dropout [124], which involves introducing noise over weights during training and estimating uncertainty with multiple stochastic forward passes.
+Recent works [283, 464] have proposed “single-model” uncertainty methods that ideally compute posterior uncertainty in one forward pass.
+Our work benchmarks representative methods from both categories (denoted in italics), motivating a cross-category comparison and analyzing their individual and joint effectiveness in modeling predictive uncertainty.
+Additionally, we later experimented with alternative scalable uncertainty methods, namely stochastic gradient MCMC methods, cyclical SG-MCMC (cSG-MCMC) [530], and a single forward pass uncertainty method incorporating a Gaussian Process (GP) output layer, Spectral-normalized Neural Gaussian Process (SNGP) [283]. Results and discussion for these are included as a self-contained subsection, Section 3.7.
+
+3.3.2
+
+Predictive Uncertainty Methods
+
+We will first introduce each method by explaining the algorithm, followed by advantages or identified shortcomings, with subsequent method extensions from the same procedure category. Finally, we will zoom in on how to quantify uncertainty using each method.
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0085.txt b/assets/txts/pg_0085.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8384c3fbe3cb1fc2c9473bb09e31e394bdf73311
--- /dev/null
+++ b/assets/txts/pg_0085.txt
@@ -0,0 +1,67 @@
+UNCERTAINTY METHODS
+
+53
+
+3.3.2.1
+
+Monte Carlo Dropout
+
+The seminal work of [124] on Monte Carlo Dropout (MC Dropout, MCD) proposes efficient model uncertainty estimation by exploiting dropout regularization as an approximate Variational Inference (VI) method. In practice, the MCD procedure boils down to (i) applying dropout on all non-linear layers’ weights, and (ii) activating dropout both during training and evaluation. Quantifying “epistemic” model uncertainty using MCD involves sampling T stochastic weight sets from the variational Bernoulli distribution \hat{\theta}_t \sim q(\theta) to calculate the lower-order moments of the approximate Gaussian posterior, respectively the predictive mean and variance (Equation (3.2)).
+
+\hat{\mu}_{pred}(x^*) = \frac{1}{T} \sum_{t=1}^{T} P(y^* | x^*, \hat{\theta}_t),
+\hat{\sigma}^2_{pred}(x^*) = \frac{1}{T} \sum_{t=1}^{T} \left[ P(y^* | x^*, \hat{\theta}_t) - \hat{\mu}_{pred} \right]^2    (3.2)
+
+MCD’s simplicity and computational tractability, i.e., dropout training is a standard DL practice and prediction only requires a single model to sub-sample from, have made it one of the most popular predictive uncertainty methods. However, an important shortcoming of VI, and in consequence of MCD in [124]’s formulation, is that it is known to underestimate predictive variance [459]. We will touch on a selection of method extensions in Sections 3.3.2.3 and 3.3.2.4.
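+A minimal sketch of the MCD procedure of Equation (3.2) in TensorFlow 2 (the framework of our released benchmark software), assuming `model` is a tf.keras classifier containing dropout layers and whose final layer outputs class probabilities; the helper name is ours:
+
+    import numpy as np
+    import tensorflow as tf  # assumed: a trained tf.keras model with Dropout layers
+
+    def mc_dropout_moments(model, x, T=10):
+        # training=True keeps dropout active at inference time, so each
+        # forward pass samples a different stochastic weight set
+        probs = np.stack([model(x, training=True).numpy() for _ in range(T)])
+        mu_pred = probs.mean(axis=0)    # predictive mean, shape (batch, K)
+        var_pred = probs.var(axis=0)    # predictive variance, Equation (3.2)
+        return mu_pred, var_pred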
+3.3.2.2
+
+Deep Ensemble
+
+Deep Ensemble [238] (DE) involves independently training multiple NNs with different random weight initializations and aggregating predictions from individual models. An ensemble of NNs trades off computational resources, due to the need to train and store M models, for uncertainty estimation and robustness to dataset shift [163, 348, 489]. In comparison to MC Dropout, DEs are treated as a uniformly-weighted Gaussian Mixture model, for which the formula for predictive variance is adapted:
+
+\hat{\sigma}^2_{pred}(x^*) = \frac{1}{M} \sum_{m} \left( \sigma^2_{\theta_m}(x^*) + \mu^2_{\theta_m}(x^*) \right) - \mu_*^2(x^*), \qquad \mu_*(x^*) = \frac{1}{M} \sum_{m} \mu_{\theta_m}(x^*)    (3.3)
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0086.txt b/assets/txts/pg_0086.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ed9b964b92e085013e8dea868e5a35ad317d908c
--- /dev/null
+++ b/assets/txts/pg_0086.txt
@@ -0,0 +1,50 @@
+54
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+The empirical performance increase of ensembles can be attributed to the diversity of uncorrelated errors between ensemble members [225]. Without functional diversity in sets of model parameters, posterior approximation quality will be lower (zero variance) and for this reason, ensemble diversity promotion is a promising avenue for further improvements [49, 196]. Alternatively, the interplay between ensembling and regularization, “the effect of a prior”, warrants more thought, since not regularizing risks overfitting, while too strong regularization risks constraining diversity (see Section 3.3.4).
+
+3.3.2.3
+
+Concrete Dropout
+
+[125] proposes a continuous relaxation of the discrete dropout distribution to adapt and optimize the dropout probability p as a variational parameter using standard gradient descent. This overcomes the limitations of uncertainty underestimation, miscalibration, and the computational complexity of manually tuning the layer-wise dropout probability in deeper models [345]. By taking advantage of the reparametrization trick, the Concrete distribution approximation \tilde{z} of the original Bernoulli random variable z conveniently parametrizes to a simple sigmoid distribution (\phi = sigmoid), allowing for gradient-based optimization. Given a uniform random noise variable u and a temperature r, the expression varies with respect to the dropout probability p, which for p → 0.5 produces, at a rate of 1/r, values approaching 1.
+
+\tilde{z} = \phi\left( \frac{1}{r} \left( \log p - \log(1 - p) + \log u - \log(1 - u) \right) \right)    (3.4)
+
+Since the dropout probability characterizes the overall posterior uncertainty, Concrete Dropout can positively influence in-domain calibration at an almost negligible cost.
+
+3.3.2.4
+
+Heteroscedastic Extensions
+
+[213, 236, 500] proposed similar approaches to extend MC Dropout to allow measuring uncertainty information from different sources. Estimating input-dependent, “heteroscedastic aleatoric” data uncertainty (detailed in Section 3.3.3) requires slightly modifying the model’s architecture and objective function following [213].
+Firstly, the output layer of model f_{\hat{\theta}} is extended with a set of learnable variance variables \sigma^2 per unique class output. The model’s output logits, v, are sampled from the stochastic output layer parametrized by N(f_{\hat{\theta}}(x), diag(\sigma^2(x))).
This
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0087.txt b/assets/txts/pg_0087.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cf3e7db05acea5bf4a62c497e83fda7b66f305a3
--- /dev/null
+++ b/assets/txts/pg_0087.txt
@@ -0,0 +1,61 @@
+UNCERTAINTY METHODS
+
+55
+
+model adaptation will be referred to as the heteroscedastic model. Fig. 3.1 visualizes the difference in output layer design.
+
+Figure 3.1. Visualization of output layer blocks. The left block denotes standard softmax (multi-class) or sigmoid (binary/multi-label) output. On the right, the heteroscedastic model outputs a normal distribution N(\mu(x), diag(\sigma^2(x))), parametrizing mean and variance by the logits coming from two separate preceding feedforward layers.
+
+Next, it requires incorporating a heteroscedastic loss:
+
+L_{HET}(\hat{\theta}) = \sum_{i=1}^{N} \left( -\log \sum_{t=1}^{T} \exp\left( v_{i,c}^{(t)} - \log \sum_{k}^{K} \exp v_{i,k}^{(t)} \right) + \log T \right)    (3.5)
+
+with N the number of training examples passing through an instance t of the model f_{\hat{\theta}_t}(x) + \sigma^{(t)} (the exponent 2 omitted in the sampling superscript) to generate for example i a sampled logit vector v_i^{(t)} \in R^K, where the predicted value for class k is v_{i,k}^{(t)} \in R, and c is the index of the ground-truth class. The above loss formulation shares notation with a categorical cross-entropy objective, although the loss is computed over T sampled logits v_i^{(t)} perturbed with parameterized Gaussian noise. By learning to predict log variance over T dropout-masked samples, the model will be able to output high variance (uncertainty) for inputs where the predictive mean is far removed from the true observation, which by design has a smaller effect on the total loss.
+
+3.3.3
+
+Uncertainty Estimation
+
+In this Subsection, we will introduce sources of uncertainty, a categorization of uncertainty measures, and how uncertainty is quantified in practice.
+Total Uncertainty  Classification models trained by minimizing negative log-likelihood quantify global uncertainty over class outcomes with entropy (H) over
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0088.txt b/assets/txts/pg_0088.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b3d00f6bebf93190db8d9b83421c62320812cd1a
--- /dev/null
+++ b/assets/txts/pg_0088.txt
@@ -0,0 +1,48 @@
+56
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+logits. Therefore, the entropy of the posterior predictive distribution provides a measure of the total uncertainty, which is a combination of model and data uncertainty [190]. Instead of entropy, posterior predictive variance can also be decomposed into model and data uncertainty using the law of total variance [92]. Decomposing total uncertainty into the different sources is beneficial for determining actions to evaluate the room for improvement.
+Model Uncertainty  Epistemic uncertainty presents the inherent ignorance [345] of the model with regards to the true values for its parameters and structure after having seen the training data. Next to predictive variance, Mutual Information (MI) [426] has been proposed as a measure of epistemic uncertainty, as intuitively it captures the amount of information that would be gained about the model parameters through “knowledge” of the true outcome [305].
+Data Uncertainty  Aleatoric uncertainty captures the inherent stochasticity and noise in data.
It can be further decomposed into a homoscedastic component, which represents constant noise over inputs such as the numerical accuracy of a measuring device, and heteroscedastic uncertainty representing input-dependent noise generated by class overlap, complex decision boundaries or label noise [92]. Heteroscedastic data uncertainty allows for the expression of instance-level uncertainty together with the best possible prediction.
+Uncertainty categorization  Here follows a categorization of the uncertainty measures from the methods (and combinations) of Section 3.3.2. We directly provide estimators for the theoretical quantities that are defined as arising from either entropy- or variance-based uncertainty decomposition in [92]. To estimate for a new test sample x^* the prediction and uncertainty of model f_{\hat{\theta}}(x^*), we typically seek to obtain the predictive posterior distribution P(y^*|x^*, \hat{\theta}) over class membership probabilities with y_k^* \in {1, ..., K}. For MC Dropout at inference time, we presume
+
+P(y^* | x^*, \hat{\theta}) \approx \frac{1}{T} \sum_{t=1}^{T} P(y^* | x^*, \hat{\theta}_t),
+
+with the prediction obtained after applying the softmax/sigmoid function for sample t, \hat{p}_t = P(y^*|x^*, \hat{\theta}_t). For Deep Ensemble, the above notation would require a change from T to M, but for consistency over quantity formulas, we maintain T to denote posterior sampling. For ease of notation, we define a helper entropy function H(x^*, \cdot) = -\sum_{k=1}^{K} P(y_k|x^*, \cdot) \log P(y_k|x^*, \cdot), with \cdot an input argument
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0089.txt b/assets/txts/pg_0089.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3abfb72e451dca948f69bf3df50fa1b2e98a3d68
--- /dev/null
+++ b/assets/txts/pg_0089.txt
@@ -0,0 +1,78 @@
+UNCERTAINTY METHODS
+
+57
+
+to the function.
+
+Quantity            Formula
+Softmax-score       S = \max_k \exp f_{\hat{\theta},k}(x^*) / \sum_{j=1}^{K} \exp f_{\hat{\theta},j}(x^*)
+Predictive Entropy  H_{pred} = H(x^*, \hat{\theta})
+Mutual Information  I = H_{pred} - \frac{1}{T} \sum_{t=1}^{T} H(x^*, \hat{\theta}_t)
+Model Uncertainty   \hat{\sigma}^2_{model} = \frac{1}{T} \sum_{t=1}^{T} (\hat{p}_t - \hat{\mu}_{pred})^2
+Data Uncertainty    \hat{\sigma}^2_{data} = \frac{1}{T} \sum_{t=1}^{T} \frac{1}{K} \sum_{k=1}^{K} var_k^{(t)}(x^*)
+
+For any classification model, it is possible to compute the softmax-score and predictive entropy. For multi-label classification, the softmax-score does not take into account multiple winning classes and a standard approximation^2 would be to average over the sigmoid-scaled probabilities of predicted classes.
+Model uncertainty can be quantified with Monte Carlo integration or the aggregation of individual models [461]. In practice, it is quantified by either (a) calculating the average sigmoid/softmax variance over the predictive mean from MC samples (Equation (3.2)) or (b) computing the total variance from an ensemble mixture distribution (Equation (3.3)). Changing to the heteroscedastic extensions allows quantifying data uncertainty. More specifically, data uncertainty is quantified with, as a “surrogate” [500], the average over the variance logits var = \sigma^2 (see Fig. 3.1). Whenever ensembling is applied where a single model estimates a quantity, one typically averages over the ensemble components’ uncertainty.
+2 Intending to compare directly with multi-class results, averaging uncertainty estimates to obtain a single summary statistic for multi-label predictions is more straightforward than
In particular, the tested multi-label datasets share low average +label cardinality, a high degree of label correlation, and a large set of unique classes (K > 50). + + \ No newline at end of file diff --git a/assets/txts/pg_0090.txt b/assets/txts/pg_0090.txt new file mode 100644 index 0000000000000000000000000000000000000000..f58db5cfe5f5fba0bf71a3d542c96989c4bc8498 --- /dev/null +++ b/assets/txts/pg_0090.txt @@ -0,0 +1,47 @@ +58 + +3.3.4 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +Motivating Hybrid Approaches + +This Subsection will motivate the theorized complementarity of VI-based and +ensembling methods for improved uncertainty estimation and robustness. +In light of the empirical success of Deep Ensemble, recent research [118, 496] +raises an important question concerning the difference in function-space between +variational Bayesian NNs (MC Dropout and extensions) and Deep Ensemble. +Deep NNs are parametrized (typically non-linear) functions presenting a highdimensional non-convex optimization problem, which may concern widely +varying curvature and many flat regions with multiple locally optimal points +within each [255]. Applying an optimization procedure to a maximum-aposteriori (MAP) objective involves a search for parameter values (hypotheses) +for which the loss function is low by navigating the high-dimensional loss +landscape. Once model training converges, one ends up with a weight-space +solution, representing a single mode of the parameter posterior . One such mode +is a local optimum of the loss function L(θ), representing unique functions fθ +as a set of NN parameters [133]. Each mode potentially marks a meaningfully +different representation of the data. +The true posterior is generally a highly complex and multimodal distribution, +with multiple possible but not necessarily equivalent parametrizations θ able to +fit the training data. To accurately quantify posterior uncertainty, we wish to +capture as many modes or separated regions as possible [117, 496]. +Correspondingly, the common goal is to achieve reliable uncertainty and, +following the BDL paradigm, one resorts to modeling a Bayesian posterior. +What differs among the selected predictive uncertainty methods, is the form of +the prior P (θ) over model parameters and likelihood P (D|θ) [336], from which +to determine a procedure. Below we expound on the difference in posterior +approximation procedure: +• MC Dropout is a common VI procedure with Bernoulli dropout and Gaussian +(L2) priors on weight-space, assuming a posterior Gaussian distribution +from which to draw stochastic samples. VI-based methods tend to locally +approximate uncertainty surrounding a single mode, intra-modal posterior +approximation. Specifically, MC Dropout’s procedure can be interpreted as +imposing a spike-and-slab parameter prior with peaked variance [333], which +offers a plausible explanation for approximated uncertainty centered tightly +around 1 mode. +• An ensemble of NNs makes no direct assumptions on the form or distribution +of the prior and just “obtains” different samples from the parameter posterior. 
+It generates a series of MAP estimates which, through inherent stochasticity in weight initialization and optimization, end up at different regions in weight space, leading to functionally dissimilar but more or less equally accurate
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0091.txt b/assets/txts/pg_0091.txt
new file mode 100644
index 0000000000000000000000000000000000000000..38a31fcda902f34b4eb4926ef14ca0477965689b
--- /dev/null
+++ b/assets/txts/pg_0091.txt
@@ -0,0 +1,46 @@
+UNCERTAINTY METHODS
+
+59
+
+modes of the solution space. Due to randomness in the optimization, some solutions may be significantly worse than others as measured by different metrics (e.g., accuracy vs. calibration). Ensembles are effective at exploring the weight-space and by solving the MAP estimation problems converge to multiple modes [117, 149], allowing for inter-modal posterior approximation. Furthermore, by considering more possible hypotheses they will be better at approximating multimodal posterior distributions and avoid the collapse to a single mode [496].
+Combining both procedures amounts to generating a mixture over priors [119], which in itself is again a prior, all under the same likelihood function. There is no guarantee that a combination of methods from both procedures captures the true posterior, yet in our work we will empirically analyse if combining inter- and intra-modal posterior approximation offers the hypothesized complementary benefits.
+
+3.3.5
+
+Uncertainty Calibration under Distribution Shift
+
+In this Subsection, we motivate the meaningfulness of evaluating uncertainty methods under distribution shift and what restricted assumptions one should reasonably specify to guarantee useful empirical results.
+We consider the problem of detecting out-of-distribution data from a trained classifier’s uncertainty. Let P^S(x, y) and P^T(x, y) denote two distinct distributions, respectively in-domain and out-of-domain. Further, we assume a classifier f : X → [0, 1] trained on P^S, whereas in the experimental setup we test on a mixture distribution P^{(S,T)}(x, y). Given an input x from the mixture, we test if the classifier’s uncertainty can be exploited to distinguish from which distribution the sample comes. To be clear, in this setting we expect to detect uncertainty arising from distribution shift and not from a lack of training data. It can be argued that there is a relationship between both, as having few in-domain samples complicates generalization, in turn increasing the chance of flagging a new data point as OOD.
+Uncertainty estimation is generally well-defined in the context of in-domain data with the standard assumption that samples are independent and identically distributed (i.i.d.). In this setting, evaluation is typically expressed in terms of calibration (Definition 8), particularly as statistical error with respect to the conditional expectation (Definition 7).
+To obtain a reliable probabilistic classifier in the traditional i.i.d. setting, explicit in-domain re-calibration approaches are effective [156, 229, 490].
However, there
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0092.txt b/assets/txts/pg_0092.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1695414ddc1821d956ae75a49885a709b9b4b9d1
--- /dev/null
+++ b/assets/txts/pg_0092.txt
@@ -0,0 +1,47 @@
+60
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+is no general principle which states that a classifier, however calibrated on P^S, would be calibrated on OOD data from P^T. Infinitely many possible shifts can violate the standard i.i.d. assumption at varying degrees of severity, affecting calibration and uncertainty estimation in unpredictable ways. With the aim of still being able to rely on a classifier’s uncertainty calibration to predict future generalization, there is a need to relax the i.i.d. assumption. An important condition for meaningful uncertainty estimation is to impose realistic, yet sufficiently restrictive assumptions on the nature of distribution changes and how P^S and P^T relate. The covariate shift [34, 418] assumption may be the most widely studied when the real-world data distribution differs from the training distribution.
+Recently, [354] formalized the problem of calibrated prediction under covariate shift with theoretical bounds on calibration transfer over domains. Critically, related works [104, 145, 335, 349, 483] prove with importance weighting that shared structure and high overlap in distribution support (or conversely, low domain divergence) is crucial to upper bound the increase of calibration error due to covariate shift. To put it plainly: while one cannot guarantee calibration on OOD data in the general case, if domains are reasonably close one can expect to retain (some if not most) benefits from in-domain calibration.
+Specific to our work, we consider two experimental settings (Section 3.4.5) with different distribution shift [320] between domains. Here we characterize each with the related distribution shift assumptions. (i) Cross-domain classification, where covariates differ, P^T(X) ≠ P^S(X), but label distributions are identical, P^T(Y|X) = P^S(Y|X) [418]. (ii) Novelty detection, where label distributions disagree, P^T(Y|X) ≠ P^S(Y|X), since the label sets differ between domains, [Y]^T ≠ [Y]^S [307]. Whereas (i) is a clear case of covariate shift, we reasonably assume for (ii) that covariates are generally close, P^T(X) ≈ P^S(X), and that the overall conditional shift will be small. Rather than interpreting novelty as a shift in label sets, one might define the probability of seeing some labels under S as exactly zero, while under T their probability is ε > 0. In practical text classification settings, novel class inputs will typically start occurring with small frequency in the real-world data distribution, as well as not having completely different syntax and semantics. This implies that ‘excess’ calibration error (defined as an expectation over the mixture) will only be impacted slightly.
+Clearly specifying distribution shift assumptions is quintessential for reliably benchmarking uncertainty methods, since the calibration of each tested method can be affected in different ways and produce results biased towards an evaluation configuration. In our selected experimental settings, we can justify uncertainty calibration under distribution shift as a reasonable methodology, without making further claims on the general applicability of this evaluation procedure.
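+The mixture test described in this Subsection reduces to a binary ranking problem: can a per-sample uncertainty score separate samples drawn from P^S and P^T? A minimal sketch under stated assumptions (uncertainty scores, e.g., predictive entropy, already computed on held-out in-domain and shifted data; function and variable names are ours, not from the released benchmark code):
+
+    import numpy as np
+    from sklearn.metrics import average_precision_score, roc_auc_score
+
+    def shift_detection_scores(unc_id, unc_ood):
+        # label 1 marks the shifted/OOD samples; a good uncertainty
+        # estimate should rank them above the in-domain samples
+        labels = np.concatenate([np.zeros(len(unc_id)), np.ones(len(unc_ood))])
+        scores = np.concatenate([unc_id, unc_ood])
+        return roc_auc_score(labels, scores), average_precision_score(labels, scores)
+
+AUROC and AUPR summarize exactly this ranking and are the shift-detection metrics adopted in Section 3.4.4.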
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0093.txt b/assets/txts/pg_0093.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80d14fcd3f8c23183a96226bd977b6f04df336c1
--- /dev/null
+++ b/assets/txts/pg_0093.txt
@@ -0,0 +1,49 @@
+EXPERIMENTAL METHODOLOGY
+
+61
+
+3.4
+
+Experimental Methodology
+
+In this work, our objective is to reliably benchmark both existing and novel combinations of predictive uncertainty methods in order to draw conclusions for text classification applications. This Section describes our study’s experimental methodology, with which we generate the empirical evidence presented in Section 3.5. Section 3.4.1 introduces our hypotheses on complementary benefits for uncertainty estimation and details the hybrid methods. Given the focus on text classification tasks, Section 3.4.2 motivates a set of representative datasets, with a specification of different text problem characteristics. Section 3.4.3 documents two pre-selected text classification architectures, the first a simple and more controllable configuration for uncertainty benchmarking, the second a more complex NLP architecture for which we will compare relative gains in robustness. To ensure correct performance benchmarking, Section 3.4.4 summarizes the metrics used for evaluating calibration and robustness. Finally, Section 3.4.5 expounds on the model setups and experimental settings devised to compare predictive uncertainty methods.
+
+3.4.1
+
+Proposed Hybrid Approaches
+
+This Subsection stands central in our work: we motivate combinations of predictive uncertainty methods. We build hypotheses on complementary benefits from combining multiple uncertainty methods, for which we present an overview of the hybrid methods in scope of our experiments (Table 3.1).
+Given the obvious parallels and differences between both procedures presented in Section 3.3.4, we hypothesize complementary benefits for uncertainty estimation and robustness.
+A. Whereas ensembles are adept at capturing multiple modes, they do not approximate uncertainty surrounding a single mode in solution space. However, since there is a lot of redundancy in function space, local neighborhood uncertainty approximation might make only a minimal contribution to the overall posterior uncertainty. [118] validated that applying subspace sampling on an optimized solution improves in-domain accuracy and calibration. They note improvements relatively lower than from increasing ensemble size (M), yet they did not analyze joint effectiveness.
+B. A procedure can only be as good as the prior and the likelihood function, which in approximation of the intractable parameter posterior is limited by computational constraints (number of MC samples T, number of ensemble
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0094.txt b/assets/txts/pg_0094.txt
new file mode 100644
index 0000000000000000000000000000000000000000..274b4e937d5fc1fc891875b947192bcc015dd10f
--- /dev/null
+++ b/assets/txts/pg_0094.txt
@@ -0,0 +1,60 @@
+62
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
In turn, by modeling an ensemble of VI approximate posteriors +less ensemble members could be required to reach the same in/out-ofdomain performance as measured by the size and quality of captured +solutions. [118] already observed that ensembles saturate after reaching +peak in-domain performance, with suboptimal models taking over the +benefit. +C. Important to note is that the influence of the prior and variational +parameters requires fine-tuning, since over-regularization will reduce the +optimization problem to one with an over-smooth, possibly unimodal +landscape [117, 133]. This eliminates any functional diversity for whatever +ensemble size, where the solution will be overconfident. Alternatively, +since the hypothesis space for a NN is often so large, with many possible +likely models for finite data, that some posterior collapse will often be +desirable to reduce the number of considered hypotheses. [496]. +Table 3.1 summarizes all model setups and hybrid methods considered for +our experiments. The most complete combination is MC Concrete Dropout +Heteroscedastic Deep Ensemble, where each member m of the ensemble has +optimized the layer-wise dropout rate p and heteroscedastic loss LHET , with the +final predictive distribution over K classes deriving from M times T stochastic +MC Dropout samples (M x T x K). +Table 3.1. In total, we consider 18 model setups, based on combining methods and +options from each column. (*) Deterministic dropout can only combine with Deep +Ensembles. CE stands for cross-entropy loss. + +Dropout + +MC sampling + +Heteroscedastic + +Deep Ensemble + +p = 0* +p = 0.5 +Concrete + +T =1 +T = 10 + +LCE +LHET + +M =1 +M =5 + +We admit two baselines, Unregularized and Regularized. +Unregularized (p = 0) offers a clean comparison, discounting any influence of +sparsification (dropout) or normalization of weight magnitude (weight decay). +However, it possibly overfits parameters to training data. In practice, one +would always apply some combination of regularization (dropout, weight decay, +batch normalization, data augmentation, ...) to counter overfitting. Regularized + + \ No newline at end of file diff --git a/assets/txts/pg_0095.txt b/assets/txts/pg_0095.txt new file mode 100644 index 0000000000000000000000000000000000000000..c64ea8b066891f6e091fb92236ec6e50767f3e1f --- /dev/null +++ b/assets/txts/pg_0095.txt @@ -0,0 +1,104 @@ +EXPERIMENTAL METHODOLOGY + +63 + +(p = 0.5) gives an alternate point of comparison over uncertainty methods, such +that we can exclude that performance increase for an uncertainty method does +not only come from regularization, which some such as MC Dropout rely upon. +Adhering to good practices and since we build ensembles with default M = 5, +we report the mean (and standard deviation) for all individual models, making +the results more statistically reliable than comparing to 1 independently trained +model. + +3.4.2 + +Datasets + +We use six well-studied real-world text corpora characterized by a different +number of classes, classification task, and size of the documents (Table 3.2). +Table 3.2. D denotes the number of documents in the dataset, K the number of classes, +I the class imbalance ratio [444], W the average number of words per document, V +the total vocabulary size respectively. 
+corpus + +task + +D + +K + +I + +W + +V + +20news +IMDB +CLINC-OOS +Reuters ApteMod +AAPD +Amazon Reviews (#4) + +newswire topic +movie review +intent detection +newswire topic +academic paper subject +product sentiment + +18,848 +348,415 +22,500 +10,786 +55,840 +8,000 + +20 +10 +150 +90 +54 +2 + +5e-4 +0.03 +0 +0.14 +0.04 +0 + +240 +325.6 +8 +125.2 +145.4 +189.3 + +212,267 +115,073 +6,188 +65,035 +66,854 +21,514 + +The first three datasets share the task of multi-class classification in three +different text domains. +20News [239] is a collection of 20K newsgroup documents with balanced samples +for 20 different newsgroups. To allow for direct comparison, we use the dataset +in the benchmark format of [172]. +IMDB movie reviews [97] (imdb) is a large sentiment classification dataset +which links user-based reviews of movies with labels on an ordinal scale between +1 and 10. Since there are no standard splits for this dataset we generate +randomized (seed 42) stratified splits of 65% for training, 15% validation and +20% for testing. +CLINC-OOS (CLINC150) [240] is a recently become popular intent detection +dataset comprising 150 training sentences for each of the 150 system-supported +services. Next to this, it offers a separate Out-of-Scope (OOS) subset with +1200 natural sentences which can be used for Out-of-Domain (OOD) detection, +more specifically detecting novel class instances. This dataset differs from the +previous two through very short “intent” sentences requiring classification in a +large output space. For training and evaluation, we use the predefined splits of +TensorFlow Datasets. + + \ No newline at end of file diff --git a/assets/txts/pg_0096.txt b/assets/txts/pg_0096.txt new file mode 100644 index 0000000000000000000000000000000000000000..84eb5bcf5a65b0950a231ae721697379a61ce05a --- /dev/null +++ b/assets/txts/pg_0096.txt @@ -0,0 +1,47 @@ +64 + +BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION + +We include two popular multi-label text classification datasets, since they are +often not considered for uncertainty experiments. We argue that they should +be included since their multi-label nature is very common in text classification +where not all labels have to be mutually-exclusive, e.g., topic categorization, +subject attribution, ... +Reuters ApteMod [17] is a multi-label news topic categorization dataset with +90 possible topics and an average low label cardinality (C) of 1.24. We use the +standard ApteMod splits. +Arxiv Academic Paper Dataset (AAPD) [505] comprises 55,840 computer +science paper abstracts that have been labeled with corresponding multiple +subject matters. Each academic paper has on average 2.41 subject targets with +a minimum of 2. For reproducibility purposes, we use the same preprocessing +steps and splits as in [5, 505] with 1K dev and 1K test samples. +Amazon Reviews [45] is a widely-used benchmark for domain adaptation research +in NLP. It consists of binary sentiment classification datasets from four different +domains: Books, DVDs, Electronics and Kitchen appliances. Each domain +dataset contains 1K positive and 1K negative labeled instances. Following +the convention of previous works [103, 557], we construct 12 balanced crossdomain sentiment analysis tasks, where for each source dataset we randomly +hold out 400 test instances to evaluate in-domain and always predict on the full +target dataset. We reserve this dataset for cross-domain experimentation only +(Section 3.4.5.2). 
+
+3.4.3
+
+Architecture
+
+This Subsection motivates the two NLP architectures in scope for the experiments.
+TextCNN architecture  We use a 1-D Convolutional NN for text classification (TextCNN), following the model structure of [218]. We chose this architecture for its comparative simplicity and solid out-of-the-box performance on a range of text classification tasks. Even as a light-weight model, it can deal with feeding in text sequences of varying sizes and learning n-gram-like structures over word embeddings, allowing a fair comparison across text datasets. An extensive hyperparameter study determined that regularization does not impact its performance much [537].
+Transformer architecture  Models in NLP have become increasingly deep and complex with the advent of the Transformer architecture [473]. [94] combined multiple bidirectional Transformers with wordpiece tokenization and self-supervised pretraining objectives (masked language modeling and next sentence prediction) to create the contextual representation modeling
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0097.txt b/assets/txts/pg_0097.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8294eafcdbb99035e82493de49f69922f1fc8ae5
--- /dev/null
+++ b/assets/txts/pg_0097.txt
@@ -0,0 +1,27 @@
+EXPERIMENTAL METHODOLOGY
+
+65
+
+architecture BERT. It allows for fine-tuning on downstream tasks, where BERT has outperformed task-specific architectures even in low-resource settings. In our experiments we use BERT-base (uncased, English): 12 layers, 768 hidden dimensions, 12 attention heads, with a total number of 110M parameters.
+
+(a) TextCNN
+
+(b) BERT
+
+Figure 3.2. Simplified block-diagrams for each of the NN architectures, demonstrating on which layer weights dropout is applied.
+(a) The TextCNN model architecture with 3 kernels (K1–K3), E word embedding dimensionality and F number of feature maps per kernel.
+(b) The BERT model architecture with L Transformer blocks, hidden size H and number of self-attention heads A.
+
+Complexity  TextCNN comprises only 6M parameters, with most parameters residing in the embedding matrix. However, it is restricted to a fixed window size, with the downside of not being able to determine long-distance dependencies in text. BERT, on the other hand, has already captured prior language modeling knowledge thanks to pretraining. Nevertheless, our experiments already involve
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0098.txt b/assets/txts/pg_0098.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8597058182b794103dbca8d781d485ad088799a2
--- /dev/null
+++ b/assets/txts/pg_0098.txt
@@ -0,0 +1,47 @@
+66
+
+BENCHMARKING SCALABLE PREDICTIVE UNCERTAINTY IN TEXT CLASSIFICATION
+
+significant computational complexity, which is why we decided not to run all variations with BERT. TextCNN presents a more controllable configuration, achieving decent performance and sufficing for the evaluation of predictive uncertainty in text classification. We include an ablation study (Section 3.5.4.2) comparing specifically selected models trained with BERT as base architecture.
+
+3.4.4
+
+Evaluation metrics
+
+Since no single metric measures all desirable properties of predictive uncertainty, we use a variety of conventional metrics to evaluate our models’ performance: (a) calibration metrics, (b) proper scoring rules, and (c) classification scores.
+The metrics are defined in detail in Section 2.2.3; here we will only provide a brief description.
+For in-domain evaluation, we use the following metrics: (a) Expected Calibration Error (ECE) [156, 332], (b) Brier Score [50] and (b) Negative Log-Likelihood (NLL) [378]. We use the same metrics for out-of-domain evaluation, with the addition of (c) AUROC and (c) AUPR for distribution shift detection, following [172].
+When evaluating a model trained in a source domain on a target domain with a similar task, we denote accuracy in the target domain as OOD accuracy, as opposed to accuracy in the source domain, which we denote as ID accuracy.
+
+3.4.5
+
+Experimental design
+
+We have determined three logical settings in text classification to evaluate predictive uncertainty for each model setup. We present experiments on in-domain uncertainty to form baseline results, followed by cross-domain classification with a focus on out-of-domain detection, and finally we propose novelty detection as a new protocol to evaluate predictive uncertainty.
+While there is no gold-standard procedure for comparing multiple (uncertainty) methods over multiple (text classification) datasets, we opted for an established procedure with statistical testing via multiple comparisons [89, 109]. Since we consider an exhaustive list of model setups, we present our results in terms of rank and critical difference diagrams in order to analyze the relative performance of each method over different experimental settings.
+Concretely, each dataset concerns independent measurements, for which we rank each method, then compare average ranks, and in the event that we can
+
+
\ No newline at end of file
diff --git a/assets/txts/pg_0099.txt b/assets/txts/pg_0099.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bb45385d2a02724c5db4a2f4cdb1c87167f945f9
--- /dev/null
+++ b/assets/txts/pg_0099.txt
@@ -0,0 +1,47 @@
+EXPERIMENTAL METHODOLOGY
+
+67
+
+reject the null-hypothesis (H0: all methods have the same rank), we calculate post-hoc tests with critical differences over methods. However, only reporting ranks does not allow future researchers to compare to our work, which is why we include detailed absolute-number results in Appendix C.
+
+3.4.5.1
+
+In-domain Setting
+
+To evaluate in-domain (ID) uncertainty, we will focus on measuring calibration and prediction quality with proper scoring rules (see Section 3.4.4). The ID setting assumes that the train and test examples are i.i.d. To capture all details, we compare per task setting, multi-class and multi-label, and finally zoom in on dataset-specific observations. For the in-domain evaluation, we focus on unique contributing effects per predictive uncertainty method and the relation between method combinations and evaluation metrics.
+• When evaluating with proper scoring rules, does an absolute increase in combination size (higher T or M) correlate with better performance?
+• What effect (equal over all tasks, datasets or architectures) can be discerned per unique predictive uncertainty method?
+
+3.4.5.2
+
+Cross-domain Setting
+
+Since we test over sentiment classification datasets from multiple domains (Amazon Product Reviews), we seek to analyze uncertainty reliability across domains. However, learned knowledge from a source domain can often transfer to classification in the target domain.
Given this setting, we need to account for cross-domain generalization in addition to out-of-domain detection; the latter is the focus of our experiments.

Cross-domain generalization - how well does a classifier trained in a source domain perform on a dissimilar target domain sharing a similar task? The aim of cross-domain generalization is to learn a robust classifier, which can perform well in multiple domains even if there is limited labeled data in some of the domains. Domain discrepancy is a major challenge where, for instance, linguistic sentiment expressions used in one domain can differ from those of the source domain. For example, “garbage disposal” is neutral in kitchen appliances whereas a “garbage movie” is strictly negative. This domain discrepancy challenge is often approached by adaptation [497, 557] or by encouraging domain-agnostic feature representations [103, 129]. We propose to test out-of-domain detection with predictive uncertainty as a viable fallback strategy when achieving generalization over domains is difficult.

\ No newline at end of file diff --git a/assets/txts/pg_0100.txt b/assets/txts/pg_0100.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ca3f76f0e0f92cf870d3e4f20cfc9c19cbd1a7f --- /dev/null +++ b/assets/txts/pg_0100.txt @@ -0,0 +1,44 @@

Out-of-domain detection - how reliably can a classifier trained in a source domain communicate uncertainty in a target domain, provided good/bad generalization? Whenever a model does not generalize to OOD examples, we would expect the model to be uncertain, allowing detection in order to abstain or trigger conservative fallback strategies [108]. As a proxy for good/bad generalization we measure the gap between in-domain and target domain accuracy as evidence of train-test skew. We argue that our current setting is more realistic than benchmarking OOD detection in totally disparate domains, such as evaluating a newswire classifier on movie reviews.

Our analysis will be centered on the following question:

• How does domain similarity affect out-of-domain detection with uncertainty methods? Is there a clear increase of uncertainty given a higher OOD generalization gap?

3.4.5.3 Novelty Detection Setting

Novelty detection - how well can the model identify and communicate uncertainty on samples of a novel class? In the worst case, classifiers “fail silently” and wrongly attribute high confidence to an in-distribution class [11, 146]. In the best case, the model either lowers its confidence or signals uncertainty. Prior work hypothesizes model uncertainty to be the most impacted [213, 250].

With this experiment we simulate the conditions of novel class data by removing one or multiple classes during training. The resulting distribution shift is not too far from the original domain and cannot be considered fully out-of-distribution (as detailed in Section 3.3.5).

We determine diverse novelty detection strategies adapted per dataset. For 20news, we follow [172, 348] and take out all odd-numbered classes to simulate novel distribution shift. Since imdb is a sentiment classification dataset, we isolate the middle class, rating “5” out of the 10 ratings, from training and expect the models to allocate prediction mass to a label close to the holdout class (ratings “4” or “6”). CLINC-OOS provides a separate out-of-scope intents set on which we assess novel class robustness.
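As an illustration of the class-holdout protocol, a minimal sketch of a 20news-style split where odd-numbered classes are withheld from training and later treated as novel at test time; the data structures and names are hypothetical placeholders:

    import numpy as np

    def novelty_split(texts, labels):
        """Simulate novel-class shift: train only on even-numbered classes,
        treat the held-out odd-numbered classes as 'novel' at test time."""
        labels = np.asarray(labels)
        seen = labels % 2 == 0                    # even classes stay in training
        train_texts = [t for t, s in zip(texts, seen) if s]
        novel_texts = [t for t, s in zip(texts, seen) if not s]
        return train_texts, labels[seen], novel_texts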
We devise a new strategy for the multi-label classification datasets, where we isolate a class that is very distinct from the remaining classes, i.e., one that (i) does not often appear jointly with the remaining classes in the originally multi-label annotated dataset, and (ii) occurs frequently enough to guarantee representative results. We draw statistics on the label co-occurrence rates of each dataset, and find that for Reuters “Acquisitions” (id:0) occurs in 94% of

\ No newline at end of file diff --git a/assets/txts/pg_0101.txt b/assets/txts/pg_0101.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17f5a26d0d1fa84917ca485f5da75f598dcd3a9 --- /dev/null +++ b/assets/txts/pg_0101.txt @@ -0,0 +1,36 @@

documents as a single topic, making it an ideal candidate for testing novel class detection. For AAPD we apply a similar strategy and find the frequent label “CS.it” (id:0) to have relatively low label co-occurrence (2.49), even when there are at least 2 labels to be predicted per sample. We isolate all examples where the novel class appears, either alone or in combination with other labels.

We focus our analysis on three specific questions concerning predictive uncertainty under distribution shift, and compare generally to other modality benchmarks:

• Do hybrid predictive uncertainty methods incrementally or critically improve detection of unseen class instances?
• Does calibration in the in-domain setting translate to calibration under distribution shift?
• Do we see the same trends as in benchmarks from different modalities (Section 3.2)?

3.5 Results

We will present the experimental results in a step-wise manner to avoid confusion about the conclusions to be drawn. We start with general and task-specific trends observed for the in-domain setting, followed by the distribution shift experiments: cross-domain classification and novelty detection. Finally, we present 4 ablation studies on critical, learned or empirically set hyperparameter values.

Figure 3.3. In-domain results with critical difference diagram comparing all methods by average rank, with the calculated critical difference in the top-left and Friedman χ2 p-value top-right. Concrete Dropout Ensemble achieves the highest NLL rank. While comparing over 5 datasets, the critical difference is large, with only the two top-ranked methods significantly differing from MC Dropout.

\ No newline at end of file diff --git a/assets/txts/pg_0102.txt b/assets/txts/pg_0102.txt new file mode 100644 index 0000000000000000000000000000000000000000..c160caea739d6724ad6fb0f165cf0f40ba281170 --- /dev/null +++ b/assets/txts/pg_0102.txt @@ -0,0 +1,241 @@

3.5.1 Experiment: In-domain

Naively combining predictive uncertainty methods will not give any absolute performance increase, as proper scoring rules show no correlation (-0.01) with the absolute number of predictive uncertainty methods combined. This requires deeper analysis to identify which singular or hybrid methods do significantly outperform baselines.

First, we visualize general results with critical difference diagrams comparing all methods by average ranking over datasets (Fig. 3.3). Critical difference (CD) can be interpreted as the smallest difference between methods that is likely to indicate a significant improvement.
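For reference, a minimal sketch of the ranking procedure behind these diagrams (average ranks, Friedman test, and Nemenyi critical difference, following the multiple-comparison setup of [89]); the placeholder scores and the q_alpha constant are illustrative assumptions:

    import numpy as np
    from scipy import stats

    # scores[i, j]: metric for method j on dataset i (lower is better, e.g. NLL)
    scores = np.random.rand(5, 18)                 # placeholder values

    ranks = np.apply_along_axis(stats.rankdata, 1, scores)
    avg_ranks = ranks.mean(axis=0)                 # rank 1 = best method

    # Friedman test: H0 = all methods have the same average rank
    chi2, p_value = stats.friedmanchisquare(*scores.T)

    # Nemenyi critical difference: CD = q_alpha * sqrt(k(k+1) / (6N))
    n_datasets, k = scores.shape
    q_alpha = 3.48                                 # illustrative studentized-range value
    cd = q_alpha * np.sqrt(k * (k + 1) / (6.0 * n_datasets))
    # methods whose average ranks differ by more than `cd` differ significantly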
In short, the null hypothesis —there is no significant difference between the methods— cannot be rejected for methods connected by a dark bar. We also report the Friedman χ2, a non-parametric statistical test that ranks methods over different attempts, in our case datasets, and requires a minimum of 3 methods in the comparison. This test checks whether the measured average ranks are significantly different from the mean rank that is expected under the null-hypothesis.

Table 3.3. In-domain (left) combined Brier and NLL proper scoring rule pairwise comparison counts of wins/draws/losses and (right) ECE metric reported for comparing in-domain calibration. For in-domain predictive accuracy, ensembles are clearly superior. Considering only miscalibration, Concrete Dropout generally adds calibration to predicted probabilities. The combination with MC Dropout gives unpredictable ranking results.

Brier + NLL (proper scoring rules):

ref  method                                          wins  draws  losses
  9  Deep Ensemble                                    142      0      28
 12  Concrete Dropout Ensemble                        135      1      34
 16  Heteroscedastic Concrete Dropout Ensemble        130      4      36
 15  MC Heteroscedastic Ensemble                      114      2      54
 17  MC Heteroscedastic Concrete Dropout Ensemble     114      2      54
 11  MC Ensemble                                      111      3      56
 13  MC Concrete Dropout Ensemble                     102      0      68
 10  Deep Ensemble Regularized                         90      1      79
 14  Heteroscedastic Ensemble                          82      2      86
  0  Unregularized                                     79      4      87
  5  Concrete Dropout                                  77      1      92
  7  Heteroscedastic Concrete Dropout                  70      3      97
  8  MC Heteroscedastic Concrete Dropout               65      2     103
  6  MC Concrete Dropout                               58      0     112
  4  MC Heteroscedastic                                40      5     125
  2  MC Dropout                                        39      6     125
  1  Regularized                                       34      0     136
  3  Heteroscedastic                                   30      0     140

ECE:

ref  method                                          wins  draws  losses
  5  Concrete Dropout                                  68      1      16
 12  Concrete Dropout Ensemble                         58      1      26
  4  MC Heteroscedastic                                52      1      32
  8  MC Heteroscedastic Concrete Dropout               52      0      33
  2  MC Dropout                                        49      2      34
 15  MC Heteroscedastic Ensemble                       48      1      36
 16  Heteroscedastic Concrete Dropout Ensemble         48      0      37
  7  Heteroscedastic Concrete Dropout                  46      0      39
  9  Deep Ensemble                                     45      1      39
  0  Unregularized                                     40      2      43
  6  MC Concrete Dropout                               40      0      45
 11  MC Ensemble                                       38      2      45
 17  MC Heteroscedastic Concrete Dropout Ensemble      37      1      47
  1  Regularized                                       32      0      53
  3  Heteroscedastic                                   29      2      54
 14  Heteroscedastic Ensemble                          27      2      56
 10  Deep Ensemble Regularized                         24      2      59
 13  MC Concrete Dropout Ensemble                      23      0      62

Table 3.3 shows more detailed pairwise comparison scores, demonstrating that if both proper scoring rules are considered, plain ensembles and hybrid methods based on deep ensembles are overall superior to single-model uncertainty prediction methods. However, the benefit resides more in accuracy than calibration, where some single-model predictive uncertainty methods rank higher, specifically Concrete Dropout.

\ No newline at end of file diff --git a/assets/txts/pg_0103.txt b/assets/txts/pg_0103.txt new file mode 100644 index 0000000000000000000000000000000000000000..836e7c5bad5ee388379791d1bed12567514a28f2 --- /dev/null +++ b/assets/txts/pg_0103.txt @@ -0,0 +1,43 @@

For a more complete answer on the unique effects per predictive uncertainty method, we need to analyze dataset-specific results. Detailed results per dataset and metric (Appendix C.1 Fig. A.1) reconfirm that a method's superiority (i.e., for the whole application domain of in-domain text classification) should not be concluded based on a single dataset.
Each dataset has specific problem characteristics, which affect method ranking differently at varying magnitudes. However, the comparative performance of each method is not fully dependent on the dataset tested, with Deep Ensemble performing reliably in-domain as evidenced by rank.

3.5.2 Experiment: Cross-domain

This Subsection is dedicated to analyzing predictive uncertainty methods under domain shift. We first present results on cross-domain generalization, followed by a challenging OOD detection setting. Finally, we draw parallels between both settings' experimental results.

We conduct extensive experiments on the benchmark Amazon product review datasets on a total of 12 source-target domain configurations. Each domain is abbreviated by its first uppercase letter: (B)ooks, (D)VD, (E)lectronics, (K)itchen. Fig. 3.4 reports on the lowest cross-domain generalization gap between ID and OOD domain datasets. We observe higher ID accuracy for Kitchen and Electronics, which can indicate a relatively lower complexity of domain sentiment. Importantly, the gaps between Kitchen–Electronics and Books–DVD are the smallest overall, coinciding with our intuitions on domain similarity. Remarkably, regularized Deep Ensemble trained on Book reviews even scores higher accuracy (+1.8%) on its target domain (B→D).

Figure 3.4. Lowest accuracy generalization gap, in-domain (Acc_ID) minus out-of-domain (Acc_OOD) accuracy (y-axis), of all predictive uncertainty methods per source→target domain combination (x-axis).

To analyze the cross-domain performance of predictive uncertainty methods, we report the average rank of ID NLL and OOD accuracy (Fig. 3.5).

\ No newline at end of file diff --git a/assets/txts/pg_0104.txt b/assets/txts/pg_0104.txt new file mode 100644 index 0000000000000000000000000000000000000000..7eaaaad716a7af0baa13398af86f83cdbf881091 --- /dev/null +++ b/assets/txts/pg_0104.txt @@ -0,0 +1,35 @@

Figure 3.5. Average rank of in-domain NLL for the 4 source datasets (left) and out-of-domain accuracy over 12 source-target configurations (right) for all tested predictive uncertainty methods.

Heteroscedastic Concrete Dropout Ensemble ranks highest in-domain when evaluated with a proper scoring rule. Models without any regularization achieve higher OOD accuracy scores, with Deep Ensemble significantly outperforming more than half of the predictive uncertainty methods (first black bar). A possible explanation could be that most target domain data is more similar to the source domain than expected, effectively giving an edge to methods that achieve high ID accuracy.

To evaluate out-of-domain detection, we report AUROC ranks in Fig. 3.6 and additionally plot OOD detection over generalization scores in Fig. 3.7. Concrete Dropout Ensemble and variations outrank other methods on OOD detection. Nevertheless, we must nuance the ranking results, since the magnitude of AUROC is generally low, close to random (50-54%) with no class imbalance, over all 12 cross-domain settings. These results might indicate that, from the perspective of the methods tested, there are no salient differences between the different domains. More specifically, Books and DVD as a source have AUROC scores on target OOD domain data centered around 51%, and Kitchen and Electronics as a source have comparable AUROC scores with one higher-AUROC (54%) cluster for OOD Books and DVD targets.
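Concretely, the OOD-detection AUROC reported here treats an uncertainty quantity as a detector of target-domain inputs; a minimal sketch with predictive entropy as the score (array names, shapes, and the Dirichlet placeholders are assumptions):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    def predictive_entropy(samples):
        """samples: (T, N, K) probabilities from T stochastic passes/members."""
        mean_probs = samples.mean(axis=0)
        return -np.sum(mean_probs * np.log(mean_probs + 1e-12), axis=1)

    # placeholder sampled predictive distributions for ID and OOD inputs
    samples_id = np.random.dirichlet(np.ones(5), size=(20, 100))
    samples_ood = np.random.dirichlet(np.ones(5), size=(20, 100))

    h_id, h_ood = predictive_entropy(samples_id), predictive_entropy(samples_ood)
    y_true = np.concatenate([np.zeros_like(h_id), np.ones_like(h_ood)])
    y_score = np.concatenate([h_id, h_ood])
    auroc = roc_auc_score(y_true, y_score)   # ~0.5 means ID/OOD indistinguishable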
Figure 3.6. Average rank of OOD AUROC over 12 cross-domain settings for predictive uncertainty methods.

Additionally, Fig. A.2 in Appendix C.1 demonstrates a similarly clear difference

\ No newline at end of file diff --git a/assets/txts/pg_0105.txt b/assets/txts/pg_0105.txt new file mode 100644 index 0000000000000000000000000000000000000000..359a48721de4675da6a5ea7a51157c43780d72a8 --- /dev/null +++ b/assets/txts/pg_0105.txt @@ -0,0 +1,35 @@

in the correlation effect size of uncertainty quantities with ID-OOD data depending on the target domain, e.g., a high overall mean correlation (0.3) for the Kitchen source evaluated on the disparate domain of Books, whereas uncertainty correlation on Electronics averages around 0.1 for the most correlated quantities.

Figure 3.7. AUROC detection magnitude (y-axis) mapped over OOD accuracy (x-axis), with a legend on the right for methods that support uncertainty estimation.

3.5.3 Experiment: Novelty Detection

Before analyzing which predictive uncertainty methods provide better detection of instances of an unseen class, we report on how uncertainty metrics (cf. Section 3.3.3) correlate with novel class data.

In Fig. 3.8 the final rank over datasets confirms the superior robustness of predictive entropy as an uncertainty metric. Logically, it is closely followed by the maximum softmax score. Next, model uncertainty correlates generally well with novel class data. Interestingly, model uncertainty outperforms entropy on AAPD, with most methods showing the need for learning from more data to better approximate the model parameters.

Similarly to the evaluation of in-domain performance, we use CD diagrams (Fig. 3.10) with the binary detection metrics AUPR and AUROC to provide a ranking of predictive uncertainty methods over datasets.

The absolute pairwise comparisons (Table 3.9) confirm that hybrid predictive uncertainty methods improve detection of novel class data. Quite surprisingly, Deep Ensemble, which ranked highest in-domain, drops multiple ranks in favour of combination ensembles (Heteroscedastic Ensemble or even MC Concrete Dropout). The in-domain calibration effect from Concrete Dropout appears to carry over to this novelty detection setting. More importantly, it also helps boost the novelty detection performance of Deep Ensembles when jointly used (e.g., MC Concrete Dropout Ensemble).

\ No newline at end of file diff --git a/assets/txts/pg_0106.txt b/assets/txts/pg_0106.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f9ab5943df46a254ba4f8bd378c8ac4beee8dc0 --- /dev/null +++ b/assets/txts/pg_0106.txt @@ -0,0 +1,37 @@

Figure 3.8. We report the Pearson Correlation Coefficient (PCC) between uncertainty values and the binary variable ID-OOD for 5 benchmark datasets: (a) 20news, (b) CLINC150, (c) imdb, (d) Reuters, (e) AAPD. A higher absolute correlation score points to a stronger association of uncertainty and novelty detection. *Model Uncertainty (MU), Data Uncertainty (DU), Mutual Information (MI).

While comparing over 5 datasets, there is no critical difference between the average ranking of methods, which can point to task or dataset-specific interactions. Fig.
3.11 shows the variation of AUROC performance for the different methods, from which we can observe that (non-finetuned) dropout sampling (MC Dropout) underperforms in most datasets, most clearly on AAPD, by severely underestimating uncertainty on samples of a novel class. We also observe relative benefits of the Heteroscedastic loss function for multi-class text classification, which is most clearly reflected in the CLINC150 results. The same visualization allows us to evaluate the quality of uncertainty quantification for each method. Generally, epistemic uncertainty derived from ensembles offers higher-quality detection of novel class data than single-model predictive uncertainty. This effect is clearly visible for multi-class classification, where the ensembles clearly group on top, as opposed to the results for the multi-label datasets.

\ No newline at end of file diff --git a/assets/txts/pg_0107.txt b/assets/txts/pg_0107.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4c0b827b135bd4b58a592895758a0e51e30f3fa --- /dev/null +++ b/assets/txts/pg_0107.txt @@ -0,0 +1,108 @@

method                                          wins  draws  losses
MC Concrete Dropout Ensemble                     121      1      48
Heteroscedastic Ensemble                         119      1      50
MC Concrete Dropout                              109      1      60
MC Heteroscedastic Ensemble                      102      0      68
Deep Ensemble Regularized                        100      0      70
Concrete Dropout                                  90      1      79
MC Heteroscedastic Concrete Dropout Ensemble      89      2      79
MC Heteroscedastic Concrete Dropout               86      1      83
Concrete Dropout Ensemble                         83      0      87
Regularized                                       81      1      88
Heteroscedastic                                   80      0      90
Deep Ensemble                                     80      0      90
Heteroscedastic Concrete Dropout Ensemble         75      2      93
MC Heteroscedastic                                75      0      95
MC Ensemble                                       71      2      97
Unregularized                                     69      0     101
Heteroscedastic Concrete Dropout                  47      1     122
MC Dropout                                        46      1     123

Figure 3.9. Novelty detection AUROC and AUPR pairwise comparison counts of wins/draws/losses.

Figure 3.10. Novelty detection CD diagram of AUROC.

Additionally, we visually detail in Appendix C.1 Fig. A.3 density estimates for uncertainty quantities with respect to in-domain versus novel data, with most hybrid ensemble methods demonstrating better-separable densities.

3.5.4 Experiment: Ablations

In this Subsection, we zoom in on the best-performing uncertainty prediction methods relative to the complementary benefits hypothesized for hybrid approaches (Section 3.4.1), provide explanations for results specific to an architecture (TextCNN vs. BERT, Section 3.4.3), and present ablations on critical hyperparameters.

\ No newline at end of file diff --git a/assets/txts/pg_0108.txt b/assets/txts/pg_0108.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c81b207423eb47ec1743a5b7d6d5092b6885d8a --- /dev/null +++ b/assets/txts/pg_0108.txt @@ -0,0 +1,32 @@

Figure 3.11. Comparison with AUROC(↑) and epistemic uncertainty PCC(↑) for task and dataset-specific differences in novel class detection: (a) AUROC, (b) Epistemic uncertainty. Methods with 0 correlation do not support model uncertainty quantification.

3.5.4.1 Diversity

Diversity of samples drawn from a posterior, either via T MC samples and/or M ensemble components, is an important condition for efficient uncertainty estimation. If each sample presents a similar function, the overall prediction can be overconfident, and drawing more samples will not reduce this.
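Anticipating the diversity measure formalized next, a minimal sketch of function-space diversity as the average KL divergence between each sampled prediction and the predictive mean; array names and shapes are assumptions:

    import numpy as np

    def function_space_diversity(samples):
        """samples: (T, N, K) probabilities from T posterior samples
        (ensemble members or MC-dropout passes). Returns the mean KL
        between each sampled prediction and the predictive mean."""
        eps = 1e-12
        mean_probs = samples.mean(axis=0, keepdims=True)      # (1, N, K)
        kl = np.sum(samples * (np.log(samples + eps) - np.log(mean_probs + eps)),
                    axis=-1)
        return kl.mean()                                      # average over T and N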
We derive a small experimental setting from [118] to measure function-space diversity for all predictive uncertainty methods involving posterior sampling. In Fig. 3.12 we analyze the relation between accuracy and diversity, as measured by the Kullback-Leibler divergence between a sampled prediction and the predictive mean, (1/T) Σ_{t=1}^{T} KL( p(y*|x*, θ̂_t) || p̄(y*|x*, θ̂) ). For a fair comparison, we calculate diversity at the ensemble level if a predictive uncertainty method consists of multiple models, else at the dropout sample level.

While the diversity-accuracy plane does not show a one-to-one linear relationship, we note in Fig. 3.12 (a,b,d) promising results for hybrid ensemble methods, which with higher diversity improve on accuracy over Deep Ensemble. The visual of imdb (c) registers overall low diversity, even for simple predictive uncertainty methods, which generally achieve higher diversity, albeit by capturing multiple dissimilar yet weaker functions. For AAPD (e), most methods are tied in exact accuracy even with different diversities.

\ No newline at end of file diff --git a/assets/txts/pg_0109.txt b/assets/txts/pg_0109.txt new file mode 100644 index 0000000000000000000000000000000000000000..60affdc236c78b2bac1b5d15090544c519af81a4 --- /dev/null +++ b/assets/txts/pg_0109.txt @@ -0,0 +1,37 @@

Figure 3.12. Detailed accuracy scores mapped over diversity measured by average KL divergence for each of the benchmark datasets: (a) 20news, (b) CLINC150, (c) imdb, (d) Reuters, (e) AAPD.

3.5.4.2 NLP Architecture

We selected specific representative predictive uncertainty methods on the basis of our previous experiments to run with the Transformer BERT as base architecture. We argue that the chosen architecture can have a non-negligible impact on uncertainty estimation, and we compare with the simple yet controllable TextCNN architecture in order to investigate whether the same conclusions hold for novelty detection.

The separate Out-of-Scope set of CLINC150 allows us to easily evaluate novelty detection with BERT. We observe in Fig. 3.14 on CLINC150 that BERT does increase novelty detection over all metrics. Even without any hyperparameter tuning, Unregularized BERT outperforms all TextCNN models. Overall, we register the same ranking of predictive uncertainty methods, although a Deep Ensemble with BERT is superior to hybrid ensembles. Crucially, we note that the correlation of epistemic uncertainty with novelty detection is higher for each TextCNN ensemble than for every single BERT model.

\ No newline at end of file diff --git a/assets/txts/pg_0110.txt b/assets/txts/pg_0110.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f25806344ef7e72d75921b910c8e78e12f669bd --- /dev/null +++ b/assets/txts/pg_0110.txt @@ -0,0 +1,31 @@

Figure 3.13. Novelty detection scores mapped per architecture for the benchmark datasets without a dedicated OOD split: (a) ID Accuracy - imdb, (b) ID Accuracy - Reuters, (c) AUROC - 20news, (d) AUROC - Reuters, (e) Epistemics - 20news, (f) Epistemics - imdb. The legend of Fig. 3.11 applies here.

Figure 3.14. Detailed AUROC-epistemics (PCC) scores mapped per architecture on CLINC150. Best performance: upper-right corner. The legend of Fig. 3.11 applies here.

Most notably, results on all other datasets are inconsistent with the above.
\ No newline at end of file diff --git a/assets/txts/pg_0111.txt b/assets/txts/pg_0111.txt new file mode 100644 index 0000000000000000000000000000000000000000..607996c7119d3637f8147dbe4dfe0ace81b3c8de --- /dev/null +++ b/assets/txts/pg_0111.txt @@ -0,0 +1,38 @@

For comparison, we have trained an informed sub-selection of predictive uncertainty methods with BERT as base architecture (Fig. 3.13).

Generally, we observe in (a,b) higher ID accuracy for BERT, with relatively smaller gains from ensembling. AUROC scores (c,d) are well below even single TextCNN models, pointing to a crucial deficiency of BERT in a novelty detection setting. The correlation of epistemic uncertainty with novel class samples draws a similar picture (e,f). MC Heteroscedastic Concrete Dropout Ensemble on imdb does produce more correlated epistemic uncertainty than all other methods.

Figure 3.15. Visualization of representative dataset-quantity/metric combinations mapped over stepwise increasing ensemble size M: (a) 20news - MI, (b) CLINC150 - AUROC, (c) imdb - MU, (d) Reuters - MU, (e) AAPD - H. Note that positive and negative correlations follow from the quantity reported. Given the small relative differences, plots are best viewed online.

3.5.4.3 Ensemble size M

Combining models into an ensemble generally benefits performance both in- and out-of-domain. Previous research [118, 238] has shown that ensembling benefits stagnate with larger ensemble size M. Fig. 3.15 selectively reports novelty detection

\ No newline at end of file diff --git a/assets/txts/pg_0112.txt b/assets/txts/pg_0112.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d2a5760cc843cb4f8a49d17efcf0f0cac203555 --- /dev/null +++ b/assets/txts/pg_0112.txt @@ -0,0 +1,33 @@

metrics or uncertainty correlation scores for all ensemble-based methods of different sizes.

The AUROC score for CLINC150 (3.15b) is a representative example of the expected effect of ensembling. Importantly, it provides crucial evidence for our general hypothesis, demonstrating that ensembling over predictive uncertainty methods gives complementary benefits in novelty detection settings. What is similarly interesting is that the relative benefit of ensembling shows slightly different curves in certain cases. Epistemic uncertainty for imdb (3.15c) already attains similar performance at M=2, again showing a comparatively slower (since less required) increase at larger M for hybrid ensembles. AAPD (3.15e) shows more stagnant behavior for the reliability of entropy with growing ensemble size, irrespective of the predictive uncertainty method.

3.5.4.4 Concrete Dropout p

Figure 3.16. Learned dropout probability per layer for each method with Concrete Dropout. The first 3 layers are the CNN kernels (K1–K3), followed by the penultimate layer µ, possibly with σ for modeling heteroscedasticity. The legend of Fig. 3.17 applies here.

Fig. 3.17 relays an important observation on the dataset-wise adaptation of Concrete Dropout: the learned dropout rate increases as required for the problem at hand. This reinforces the argument against fixed-rate dropout. [125] remarked that practitioners have started to adopt the strategy of fine-tuning dropout with a bottleneck pattern, i.e., starting with a higher dropout rate in early layers and decreasing it the deeper you go in the network. Our results (Fig.
3.16) show a discrepancy with this practice, specifically for 20news and CLINC150.

\ No newline at end of file diff --git a/assets/txts/pg_0113.txt b/assets/txts/pg_0113.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb4e7a903c715650908e86bcbe9bb23048b0d38c --- /dev/null +++ b/assets/txts/pg_0113.txt @@ -0,0 +1,33 @@

We do note that both datasets converged to low dropout rates, which can provide the basis for this differing behavior.

Figure 3.17. Top: average epoch of convergence per dataset. Bottom: average learned Concrete Dropout probability per dataset over predictive uncertainty methods. We observe very dataset-dependent dropout rates.

3.6 Discussion

Our study investigates both scalable and hybrid procedures for incorporating uncertainty into DL models for text classification. Next to baseline in-domain uncertainty evaluation, we have designed two experimental settings, novelty detection and cross-domain classification, to analyze the reliability of uncertainty. Additionally, we devised ablation studies to analyze important hyperparameters in connection to our three hypotheses (Section 3.4.1) on complementary benefits for hybrid uncertainty prediction methods.

Benchmarking uncertainty methods We summarize our findings succinctly and discuss the results of each experimental setting.

We find that individually (> indicating “outperforms” over all experiment settings):

Deep Ensemble > Concrete Dropout > (MC) Heteroscedastic ≥ MC Dropout

We find that jointly, by considering method combinations:

(MC) Concrete Dropout Ensemble ≥ (MC) Heteroscedastic Ensemble > MC Concrete Dropout > Deep Ensemble > Deep Ensemble Regularized > MC Dropout

\ No newline at end of file diff --git a/assets/txts/pg_0114.txt b/assets/txts/pg_0114.txt new file mode 100644 index 0000000000000000000000000000000000000000..055fb52976534f236915c90070cafc9f3802df78 --- /dev/null +++ b/assets/txts/pg_0114.txt @@ -0,0 +1,42 @@

In-domain results (Section 3.5.1) corroborate the superiority of Deep Ensemble with high accuracy and proper scores (NLL, Brier). Table 3.3 demonstrates that the improvements come from accuracy as opposed to calibration, where Concrete Dropout-based methods rule.

Cross-domain experiments (Section 3.5.2) give differing conclusions: cross-domain generalization results are similar to in-domain, whereas out-of-domain detection follows the novelty detection results. Our evaluation of uncertainty quantities (Fig. A.2) demonstrates reliably higher correlation of uncertainty with domain discrepancy. We do take note of the relatively low-magnitude AUROC (Fig. 3.6), which underlines how challenging out-of-domain detection is in a domain adaptation setting with comparably similar linguistic patterns.

Novelty detection (Section 3.5.3) in text classification gives the reverse results: hybrid ensemble methods with Concrete Dropout rank highest as scored by AUROC, AUPR and model uncertainty correlation, followed by other method combinations that induce calibration. We do note that specific method performance is often tied to task and dataset characteristics, with results averaged over the 5 benchmark sets showing statistically non-significant differences between methods. As shown in Table 3.9, standard Deep Ensemble, i.e., without any regularization or prior from combining methods, performs worse outside the in-domain setting.
The case for standard MC Dropout is even worse, with novel class robustness (AUROC and AUPR) lower than the Unregularized point-estimate model.

Remarkably, BERT performs worse than the simpler TextCNN model at detecting distribution shift in the form of novel class data (Fig. 3.14). Results on the OOS set of CLINC150 differ from results obtained on all other datasets, which we believe can be attributed to the short, in-domain intent commands differing strongly in vocabulary from the OOS samples, resulting in a comparatively less challenging novelty detection setting. We contend that novelty detection is actually more challenging for BERT despite its pretrained language modeling knowledge and because of the strict requirement to fine-tune the task-specific final layer with new supervision. Its ability to detect (and overly rely on, e.g., [162]) statistically relevant yet possibly spurious cues in language data will make it overconfident when transferring to a new task where the i.i.d. assumption cannot be maintained.

Validating hybrid approaches We have empirically analyzed individual and joint effectiveness in modeling predictive uncertainty and will answer our three hypotheses on complementary benefits from combining inter- and intra-model posterior approximation.

\ No newline at end of file diff --git a/assets/txts/pg_0115.txt b/assets/txts/pg_0115.txt new file mode 100644 index 0000000000000000000000000000000000000000..1440627302f4a2dcfd5000a5774b80b175cfe38a --- /dev/null +++ b/assets/txts/pg_0115.txt @@ -0,0 +1,46 @@

Firstly [A], ensembling (increasing M) proves to give relatively higher performance benefits than stochastically sampling predictions from an optimized solution (T). The effect is clearest in the in-domain setting (Table 3.3) and is less pronounced in the out-of-domain settings. For a given predictive uncertainty method, we cannot provide solid evidence that uncertainty reliability always improves with subspace sampling (increasing T, “MC”). AUROC and AUPR rankings (Figs. 3.10 and 3.6) present evidence in favour, although Fig. 3.11 depicts a more fine-grained comparison over datasets and uncertainty methods. Our analysis of diversity (Fig. 3.12) shows promising results for hybrid ensemble methods, which exhibit higher diversity in posterior samples, resulting in improved accuracy.

Secondly [B], our newly proposed hybrid uncertainty estimation methods improve effectively over singular methods, both in novelty detection (Table 3.9 and Figs. 3.10, 3.11) and out-of-domain detection (Fig. 3.6). Additionally, in ablation studies we find (Fig. 3.15) that combining predictive uncertainty methods in an ensemble attains higher performance with a lower number of models (M < 5) compared to a Deep Ensemble (M = 5).

Thirdly [C], Table 3.3 demonstrates that MC Concrete Dropout improves over MC Dropout (p=0.5) on ECE and proper scoring rules. The out-of-domain experiments (detail: Fig. 3.11) similarly show that not fine-tuning dropout to the dataset and task at hand is detrimental, even when combining models into an ensemble (e.g., MC Ensemble vs. MC Concrete Dropout Ensemble). Ablation on Concrete Dropout (Fig. 3.17) points to very dataset-dependent learned probability rates, which vary strongly layer-wise (Fig. 3.16). We link the empirical superiority of MC Concrete Dropout Ensemble to balanced posterior collapse, thanks to the VI-based optimization of the dropout prior.
We tentatively claim that the former provides constrained hypothesis support and a more fine-tuned influence of the prior.

Benchmark comparison When comparing our results to existing BDL benchmarks, most observations are consistent for in-domain and out-of-domain performance.

Our in-domain results are most similar to [348], where Deep Ensemble outperforms most methods —albeit their survey did not compare combinations of predictive uncertainty methods— whereas in our benchmark it is closely followed by hybrid ensemble methods. When evaluating over various data retention rates, [113] observed that “an ensemble of MC Dropout models” (our MC Ensemble) consistently outperforms all other methods. This survey offers the closest point of comparison, although our experimental settings vary. While we cannot directly compare cross-domain detection with other benchmarks, we argue that our cross-domain classification setting mimics their low-data-regime

\ No newline at end of file diff --git a/assets/txts/pg_0116.txt b/assets/txts/pg_0116.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e2c939bd0eb6bdd2650d0377a96ed199e1afeae --- /dev/null +++ b/assets/txts/pg_0116.txt @@ -0,0 +1,44 @@

experiments.

Across different modalities and tasks, Deep Ensemble has been reported to consistently outperform VI-based methods, most specifically MC Dropout, with/without distribution shift (image classification [348], molecule prediction [409], and pendulum physics [56]). However, for a binary image classification problem, [113] report higher accuracy for MC Dropout compared to Deep Ensemble, whereas our results suggest that MC Dropout can induce positive calibration, yet scores lower on accuracy and with proper scoring rules. In their experiments they use a fixed dropout rate of 0.2 and a fine-tuned weight decay rate, well-suited to their task at hand, which may explain the optimistic results. Another uncertainty quantification benchmark [462] reports strong results on image classification for various Monte Carlo methods, although we cannot make a direct comparison. For further discussion, we refer the reader to Section 3.7.1.

Our results suggest that BERT performs worse in a novelty detection setting, whereas [174] concludes that Transformers are considerably more robust when compared across domains, e.g., detection of news samples with a sentiment classifier. We point out below that both settings are in fact incomparable. We evaluate detection on novel samples which share vocabulary characteristics with the source domain, albeit excluded from training supervision. Their setting evaluates detection between very disparate domains, where linguistic patterns are significantly different and BERT will most probably fall back to its pretrained knowledge for detection. In short, we do believe that pretrained Transformers could perform better under varying distribution shifts, yet our results underline the exception of novel class detection. More research is needed into how the inductive bias of given NN architectures influences approximate inference.

Take-homes For predictive uncertainty in text classification, we derive a number of take-homes from the benchmarking evidence, centered around practical facets to consider for applications.
One has to consider (i) ease and cost of implementation, (ii) computational and memory complexity, comprising training compute, test compute and storage/memory constraints, (iii) the degree of fine-tuning required, (iv) the type of supervision: multi-class with a low/high number of classes (K) or multi-label with low/high cardinality (C), (v) the expectation of distribution shift, in the form of novel class data or unseen language patterns, and (vi) support for uncertainty quantification by source.

For a prototypical low-K multi-class text classification task, we advise Deep

\ No newline at end of file diff --git a/assets/txts/pg_0117.txt b/assets/txts/pg_0117.txt new file mode 100644 index 0000000000000000000000000000000000000000..92b8aefe4228a82875da7de2b5d024f51d725e5e --- /dev/null +++ b/assets/txts/pg_0117.txt @@ -0,0 +1,46 @@

Ensemble for solid in-domain performance and adequate distribution-shift robustness. In the case of memory or storage constraints, for example if your base model already has high complexity, using (MC) Concrete Dropout will provide calibration benefits both in- and out-of-domain, albeit at a slightly larger implementation cost. Similarly, to constrain computational complexity, it can be more sensible to rely on a TextCNN ensemble (5×6M parameters) rather than BERT (110M parameters). Considering time complexity, we have added detailed compute, time and storage statistics for the evaluated methods (Appendix B.2). We would advise against using MC Dropout if the dropout rate and weight regularization are not fine-tuned for the problem at hand. Our benchmarking experiments demonstrate the unpredictable behavior of fixed-rate MC Dropout compared to Concrete Dropout, which we used as a proxy for models with a fine-tuned dropout rate. This (mal)practice should be highlighted, as it has a substantial impact on uncertainty estimation and robustness.

If K starts to increase, it warrants the effort to implement the Heteroscedastic loss function, which will make the model more calibrated in-domain. Additionally, it enables data uncertainty estimation for possibly noisy ground truths, which occur more frequently with a larger number of classes.

If C grows larger, reliable epistemic uncertainty estimation becomes more important, since the problem is made more complex by the larger number of label combinations. Our evidence is slightly contradictory, with results obtained on Reuters suggesting MC Concrete Dropout Ensemble and on AAPD warranting Deep Ensemble. What should be clear is that any form of ensembling is valuable in multi-label classification to boost performance.

Under the expectation of distribution shift in the form of novel class data, adding Concrete Dropout with stochastic sampling to an ensemble, MC Concrete Dropout Ensemble, gives relatively strong benefits compared to a regular Deep Ensemble. Ablations also show that fewer models (M) would be required to reach similar performance. Generally, in-domain calibration-inducing methods are more robust when applied in the tested out-of-domain settings. For the in-domain setting, the incorporation of data uncertainty incrementally improves multi-class text classification. Ablation on NLP architectures (Section 3.5.4.2) points to a deficiency of BERT for detecting novel class data; under that expectation, we would similarly advise against BERT in favour of simpler text classification architectures.
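As a final practical note on the sampling-based methods recommended above, a minimal PyTorch sketch of test-time dropout sampling (the “MC” in the method names): dropout layers are kept active at inference and T stochastic passes are averaged. The model and input batch are assumed to exist; this is an illustration, not our exact implementation.

    import torch

    @torch.no_grad()
    def mc_dropout_predict(model, x, T=10):
        """Average T stochastic forward passes with dropout kept active."""
        model.eval()                             # freeze batch-norm etc.
        for m in model.modules():                # ...but re-enable dropout
            if isinstance(m, torch.nn.Dropout):
                m.train()
        probs = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(T)])
        return probs.mean(dim=0), probs          # predictive mean and raw samples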
3.7 Additional Uncertainty Approaches

Next to the method combinations benchmarked in the main work, we acknowledge two alternative approaches to uncertainty estimation with appealing

\ No newline at end of file diff --git a/assets/txts/pg_0118.txt b/assets/txts/pg_0118.txt new file mode 100644 index 0000000000000000000000000000000000000000..c97416ae2a0f9c11e5b9c517674bee1a1015cda3 --- /dev/null +++ b/assets/txts/pg_0118.txt @@ -0,0 +1,46 @@

properties such as training scalability and cheaper inference.

3.7.1 Stochastic Gradient MCMC Methods

There exists a wide range of sampling-based inference methods in the stochastic gradient MCMC (SG-MCMC) literature, which have become increasingly tractable and empirically successful for uncertainty estimation. Specifically, we re-implemented an exemplary approach [530], cyclical SG-MCMC (cSG-MCMC), which uses a cosine cyclical learning rate schedule [292] to (i) better explore the highly multimodal loss landscape and (ii) sample more efficiently from the posterior; a sketch of the schedule is given at the end of this Subsection. While this appealing approach reduces computational complexity by only training a single model, we experienced that it is very tricky to fine-tune, with many interacting hyperparameters. Instead of benchmarking these methods and reporting scores over ranges of hyperparameters, we provide a discussion of the perceived gap between theory and practice for this family of uncertainty methods.

While the stochastic MCMC setting, estimating parameter updates from minibatches, is computationally convenient, it induces several theoretical challenges: (i) minibatch noise is introduced from small subsets of data [297], (ii) omission of the Metropolis-Hastings correction step provides fundamentally biased estimates of posterior expectations [192], and (iii) the suggested practice of temperature tempering implies an approximation to the exact posterior instead of proper convergence [122, 491].

Closer to practice, [530]'s methods have been successfully benchmarked [462, 491], with reported performance on OOD detection for image classification datasets comparable to or better than Deep Ensembles. An important caveat is that all hyperparameters have been meticulously fine-tuned to the task at hand. This is non-trivial given the additional need to specify the number of cycles (guided by a training budget) and the proportion of burn-in steps, and to find an appropriately tempered posterior. The original work [530] mentions little dependence of results on these modifications to the optimization procedure, yet, similar to [122], we observed “the complexity and fragility of hyper-parameter tuning, including the learning rate schedule and those that govern the simulation of a second-order Langevin dynamics”. Additionally, making combinations of uncertainty methods with cSG-MCMC is non-trivial, since regularization in any form influences the large-scale curvature of the regions the optimizer explores.

With regard to re-implementation, we experienced issues with the indexing of sparse gradient updates for the embedding lookup, an operation pervasive in NLP architectures.
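As referenced above, a minimal sketch of the cosine cyclical step-size schedule (our paraphrase of the schedule form described in [530]; function and variable names are ours): the step size restarts at the beginning of each cycle, alternating exploration (large steps) and sampling (small steps).

    import math

    def cyclical_lr(step, total_steps, n_cycles, lr0):
        """Cosine cyclical schedule: step size restarts every cycle."""
        cycle_len = math.ceil(total_steps / n_cycles)
        t = (step % cycle_len) / cycle_len        # position within the cycle
        return 0.5 * lr0 * (math.cos(math.pi * t) + 1.0)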
\ No newline at end of file diff --git a/assets/txts/pg_0119.txt b/assets/txts/pg_0119.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e77edfcf590ffc35651e001106fe28ae2857467 --- /dev/null +++ b/assets/txts/pg_0119.txt @@ -0,0 +1,47 @@

Our original baseline models were trained with the Adam optimizer, which consistently outscored any of our cSG-MCMC experiments built upon SGD modifications.

There is an unmistakable complexity in how to sample appropriately from the true posterior, as we now rely heavily on the training data, a “weak” regularizer, to determine how noise is added for parameter-space exploration. Concurrently, the overparametrized regime is becoming commonplace in DL, especially in NLP with the advent of Transformers, which calls for more sensible priors over many millions of parameters [453] and a better understanding of how output functions are affected [107]. We believe stronger priors are available, not only over parameters P(θ) but rather over functions P(fθ(x)) as specified by the choice of architecture [192], which can make this family of methods an even more competitive challenger.

3.7.2 Spectral-normalized Neural Gaussian Process

[283] propose the Spectral-normalized Neural Gaussian Process (SNGP), a principled, scalable approach to uncertainty estimation for deep NNs. They promote “distance awareness” as a necessary condition, which they accomplish via spectral weight normalization and a GP output layer. Thanks to the mean-field approximation [295], a single forward pass suffices, without MC sampling, to estimate the predictive distribution. Empirically, SNGP was shown to outperform Deep Ensemble by some margin on OOD detection for both image and text data. By demonstrating the relative importance of the decision boundary of a single model fθ(y|x) versus averaging over multiple models, we are inspired to analyze the combination of SNGP with alternate uncertainty methods.

We have re-implemented SNGP using components of edward2 [454]: Laplace approximation, random-feature GP and spectral normalization. In our experience, the most crucial hyperparameters to fine-tune were the number of inducing points (ι ≤ 1024) and the spectral norm multiplier s. For the latter, we follow the recommended tuning procedure to find an appropriate value in the range {1, 2, 5, (10, 15)}, where we heuristically increased the search space.

For simplicity and computational reasons, we use TextCNN as base architecture. However, in order to correctly apply spectral normalization to convolutional filters [151], we had to re-implement TextCNN(v2) with 2D convolutions and maxpooling. This in turn requires specifying a fixed sequence length in advance, which invalidates a direct comparison to the experimental results of Section 3.5. We additionally re-train base models with TextCNN(v2) and combine SNGP with our Regularized baseline (Reg), with MC Dropout (MCD), Concrete Dropout

\ No newline at end of file diff --git a/assets/txts/pg_0120.txt b/assets/txts/pg_0120.txt new file mode 100644 index 0000000000000000000000000000000000000000..59e8081a1210a35c666716bb6a59cdf9f90985cf --- /dev/null +++ b/assets/txts/pg_0120.txt @@ -0,0 +1,32 @@

(CD) and Ensemble (Ens). For SNGP ensembles, we empirically selected s = 15 for the base model.
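For illustration, a minimal NumPy sketch of the soft spectral-normalization step with norm multiplier s, using one power-iteration update per call as commonly approximated; this is our paraphrase, not the edward2 implementation.

    import numpy as np

    def spectral_normalize(W, u, s=1.0, eps=1e-12):
        """One power-iteration estimate of the largest singular value of W;
        rescale W only when its spectral norm exceeds the multiplier s."""
        v = W.T @ u
        v /= (np.linalg.norm(v) + eps)
        u = W @ v
        u /= (np.linalg.norm(u) + eps)
        sigma = u @ W @ v                    # spectral-norm estimate
        if sigma > s:
            W = W * (s / sigma)              # soft constraint: ||W||_2 <= s
        return W, u                          # carry u over to the next step

    # usage sketch: persist u across training steps
    W = np.random.randn(64, 32)
    u = np.random.randn(64)
    W, u = spectral_normalize(W, u, s=15.0)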
3.7.2.1 SNGP Results

First, we present critical difference analyses for in-domain classification (Fig. 3.18) and novelty detection (Fig. 3.19). Ensembling SNGP models, Deep Ensemble SNGP, proves superior in-domain, followed by Concrete Dropout Ensemble with and without SNGP. For novelty detection, (MC) Deep Ensemble is most successful, with small differences between the next highest-ranked methods. To our surprise, SNGP ranks quite low on the text classification tasks, although in the original work it demonstrated OOD detection superior to Deep Ensemble. In what follows, we analyze the novelty detection ranking of SNGP, specifically per dataset and for multiple values of s.

Figure 3.18. CD diagram of NLL for base and SNGP method combinations with a TextCNNv2 backbone.

Figure 3.19. CD diagram of AUROC for base and SNGP method combinations with a TextCNNv2 backbone.

In order to zoom in on the relative ranking of SNGP (combination) methods, we plot in Fig. 3.20 AUROC detection scores for datasets with interesting trend changes. Overall, SNGP underperforms on CLINC-OOS, with the exception of Deep Ensemble SNGP. For 20news, SNGP and Deep Ensemble SNGP rank

\ No newline at end of file diff --git a/assets/txts/pg_0121.txt b/assets/txts/pg_0121.txt new file mode 100644 index 0000000000000000000000000000000000000000..14eae1b0607c4905d5ce46ff6a8244bf2ac66d3f --- /dev/null +++ b/assets/txts/pg_0121.txt @@ -0,0 +1,22 @@

high, although any additional regularization with SNGP worsens detection, even as an ensemble. For Reuters, we observe the exact opposite of 20news, with SNGP reporting high detection scores only when regularization is added, e.g., Regularized SNGP. Remarkably, this trend is reversed for the base model, with Unregularized scoring particularly well.

Figure 3.20. AUROC scores over unique (abbreviated) methods per dataset. Error bars are computed over multiple runs (5 seeds) for non-ensembles.

Finally, Fig. 3.21 reports on how novelty detection varies for different values of the spectral normalization multiplier s. As the trend lines indicate, larger values of s generally improve novelty detection, although AUROC varies more (larger shading) between methods and datasets. This observation prompts us to investigate the optimality of s per dataset. The right subplot shows that spectral norm multipliers are very dataset-dependent and that searching beyond the originally suggested range can give large performance boosts.

\ No newline at end of file diff --git a/assets/txts/pg_0122.txt b/assets/txts/pg_0122.txt new file mode 100644 index 0000000000000000000000000000000000000000..16d7ebe0d0b248708d8c52e7172f922db3c892db --- /dev/null +++ b/assets/txts/pg_0122.txt @@ -0,0 +1,38 @@

Figure 3.21. Left: AUROC scores (y-axis) over all datasets with unique runs plotted for base (s = 0) and SNGP TextCNNv2 models with varying spectral normalization multipliers (x-axis). Lines with shading indicate the trend observed between AUROC and s. Right: AUROC mean and stddev over runs, sampling and datasets.

3.7.2.2 SNGP Discussion

While SNGP was reported to outperform Deep Ensemble in the original CLINC OOD detection experiments [283], our results do not deliver the same ranking.
While investigating the interaction of SNGP with different uncertainty methods, we observe the non-trivial role of spectral normalization, specifically setting the norm multiplier s to an appropriate value. Additionally, we contribute an analysis of the interplay with additional regularization mechanisms, which was missing in the literature. The original work mentions that, given an approximation with the power iteration method, there is no precise control over the true spectral norm. Whereas spectral normalization keeps the magnitude of updates to weights in check, Dropout regularization and weight decay may rescale layers' spectral norm in unexpected ways. We hope our experimentation demonstrates the need for a deeper understanding of how to combine multiple regularization mechanisms and maintain a good spectral norm approximation for effective posterior approximation.

3.8 Limitations

As with the majority of benchmarking literature in Bayesian Deep Learning, the design of the current study is subject to limitations.

The first limitation concerns selection bias for text classification datasets. We benchmark 6 prototypical text classification datasets covering binary, multi-class, and multi-label classification by topic, sentiment and intent. The task

\ No newline at end of file diff --git a/assets/txts/pg_0123.txt b/assets/txts/pg_0123.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0da6c8837ec352152d8b5ff575ddbbc602da1b2 --- /dev/null +++ b/assets/txts/pg_0123.txt @@ -0,0 +1,45 @@

domain of text classification is very large, with additionally interesting variations of (i) short social media or long business document text, (ii) hierarchical or extreme multi-label text classification, and (iii) challenging task settings such as fake news detection or reading comprehension. Since these present open sub-problems in text classification, we did not consider them for our benchmarking study, yet we encourage their analysis in future research.

The second limitation is related to the representativeness of uncertainty quantification methods. We specifically opted for scalable procedures which have been increasingly gaining attention from practitioners. In total we derive 18 method combinations from two competing predictive uncertainty procedures, for which we already resort to statistical summaries and rank-based evaluation to present results. Due to computational constraints (retraining a minimum of 5 ensembles of size M = 5 per dataset and per experiment setup), we did not consider a natural Bayesian extension of Deep Ensemble, the Bayesian Ensemble [360], where all weight initialization is shared around a single prior. Additionally, Section 3.7 includes preliminary experiments with two new uncertainty approaches, cyclical SG-MCMC [530] and SNGP [283], which are less practical to benchmark, but bring promising ideas for improved, high-quality uncertainty estimation.

Finally, evaluating the quality of uncertainty quantification is an open problem in BDL, typically approached with proxy setups, as is the case in our benchmark with its focus on novelty detection and cross-domain generalization. Section 3.3.5 presents a nuanced view of this evaluation practice. In addition, evaluating reliable uncertainty estimation in NLP as opposed to other modalities is complicated due to the discrete nature of language.
Ideally, we would have extended our benchmark with more probing setups covering situations where we expect predictive uncertainty to be crucial, for instance, when dealing with noisy supervision/inputs or low-data regimes.

3.9 Chapter Conclusion

In general, while seeking to optimize for a well-approximated (whether or not Bayesian) posterior, current predictive uncertainty methods are imperfect and very often not practically useful. However, the need for practical and scalable solutions for both incorporating uncertainty and evaluating its quality is huge, as it is a prerequisite to reliable automation. Uncertainty quantification requires modality- to task-specific benchmarking to help practitioners safely rely on these methods and to inform researchers in prioritizing the right approaches.

In this work, we have presented empirical evidence from benchmarking uncertainty methods in text classification, contributing and calling attention

\ No newline at end of file diff --git a/assets/txts/pg_0124.txt b/assets/txts/pg_0124.txt new file mode 100644 index 0000000000000000000000000000000000000000..daf45b2434af04f1ea5f04090cf8d298f476808c --- /dev/null +++ b/assets/txts/pg_0124.txt @@ -0,0 +1,43 @@

to the under-explored study of uncertainty quality and model robustness in realistic NLP data distributions.

Interestingly, we find that the general behavior of predictive uncertainty methods does not hold over different datasets, with method performance often tied to the text classification task. Overall, we cannot discern a clear winning predictive uncertainty procedure, yet some methods clearly perform worse. Although a universal methodology is absent, we observe that there are specific correlations between a method's performance and the problem setting representing text classification task characteristics, for which we have formulated practical take-homes.

An important contribution is the proposed novel combinations of predictive uncertainty methods. Our benchmarking experiments have revealed MC Concrete Dropout Ensemble to be overall superior at novel class and out-of-domain detection in text classification, even with a lower ensemble size. Most notably, it outperforms Deep Ensemble, which has leading performance in recent BDL surveys on image data. We linked complementary benefits of hybrid uncertainty estimation methods to ongoing research on NN diversity in function-space and have provided more evidence in support of hybrid approaches. We have determined in an ablation study that M, the ensemble size, T, the number of Monte Carlo samples, and p, the dropout probability rate, are crucial hyperparameters to take into consideration for improved robustness and uncertainty estimation. Finally, we experimentally validated predictive uncertainty methods on real-world text classification tasks, including multi-label targets, coupling our hypotheses and results to the NLP problem space. Crucially, we found an important deficiency of BERT, compared to the simpler NLP architecture TextCNN, with respect to novel class robustness, limiting the applicability of transfer learning from pretrained Transformers under the expectation of uncertainty and novel class instances.

To further improve calibration and robustness in the text classification domain, and by extension uncertainty in NLP, we need to better understand what will make existing or novel uncertainty estimation techniques successful.
This requires the development of well-motivated tooling and protocols to reliably assess the quality and fidelity of posterior approximation. Generally, the role of priors in increasingly larger models deserves more attention. While our work focused on posterior geometry and weight-based priors in the form of regularization, stronger, more meaningful functional priors exist, which should be exploited to encourage desirable predictive behavior such as robustness to specific distribution shifts. Particularly for NLP, more focused research is required into what aspects (language data characteristics, inherent task difficulty or ambiguity, architecture design, learned representations, objectives, and effective parameter usage) render NLP pipelines more complex to imbue with reliable uncertainty and guarantee future out-of-distribution robustness.

Part II

Realistic and Efficient Document Understanding

Chapter 4

Beyond Document Page Classification: Design, Datasets, and Challenges

The contents of this chapter come from a publication [470] that was presented as an oral presentation at WACV 2024 (53/2042 ≈ 2.5%):

Jordy Van Landeghem, Sanket Biswas, Matthew Blaschko, and Marie-Francine Moens. Beyond Document Page Classification: Design, Datasets, and Challenges. In Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pages 2962–2972, 2024

Disclosing the work done: I conceptualized the work, implemented the experiments, and wrote the manuscript. Sanket Biswas helped with related work and polishing the writing, and we acknowledge help in data collection from Ruben Perez Tito and Stefan Larson.

This chapter focuses on moving beyond the (self-imposed) restrictions of page limits, and on exploring the full potential of DL for document processing. A major highlight is the need to bring document classification benchmarking closer to real-world applications, both in the nature of the data tested (X: multi-channel, multipage, multi-industry; Y: class distributions and label set variety) and in the classification tasks considered (f: multipage document, page stream, and document bundle classification, ...).
We start by introducing the problem of document classification (DC) and its importance in the larger scope of document understanding, for which we emphasize visually-rich documents, adopting the acronym VDU instead. Moreover, we identify the lack of public multipage document classification datasets, formalize the different classification tasks arising in application scenarios, and motivate the value of targeting efficient multipage document representations.

An experimental study on the proposed multipage document classification datasets demonstrates that current benchmarks have become irrelevant and need to be updated to evaluate complete documents, as they naturally occur in practice. This reality check also calls for more mature evaluation methodologies, covering calibration evaluation, inference complexity (time-memory), and a range of realistic distribution shifts (e.g., born-digital vs. scanning noise, shifting page order). This chapter ends on a hopeful note by recommending concrete avenues for future improvements, pertaining to document dataset construction efforts and suggested methodologies.

The work in this chapter was the trigger for the next chapter (Chapter 5), in which we propose a new, comprehensive DU benchmark, DUDE, that is more aligned with real-world applications and practices, naturally including multipage documents, and that satisfies many of this chapter's recommendations.

Figure 4.1. Overview of different classification tasks that can be found in real-world VDU applications and that are not sufficiently addressed in DC research. The classification task notation and definitions are introduced in Section 4.2.

4.1 Introduction

Visual Document Understanding (VDU) comprises a large set of skills, including the ability to holistically process both textual and visual components structured according to rich semantic layouts. The majority of efforts are directed toward the application-oriented tasks of classification and key information extraction (KIE) in visually-rich documents (VRDs). Document classification (DC) is a fundamental step in any industrial VDU pipeline, as it assigns a semantically meaningful category, routes a document for further processing (towards KIE, fraud checking), or flags incomplete (e.g., missing scans) or irrelevant documents (e.g., a recipe cookbook in a loan application).

Documents are intrinsically multipaged, explaining (partly) why PDF is one of the most popular universal document file formats.¹ While DC in information management workflows typically involves multipage VRDs, current public datasets [165, 233] only support single-page images and constitute too simplified benchmarks for evaluating fundamental progress in DC.

With the advent of deep learning, the VDU field has shifted from region-based analysis to whole-page image analysis. This shift led to substantial improvements in processing document images with more complex layout variability, exposing the limitations of template-based methods.
Our work highlights the opportunity and necessity of moving beyond page limits toward evaluation on complete document inputs, as they prevalently occur (multipage documents, bundles, page streams, and splits) across various practical scenarios within real-world DC applications, as demonstrated in Figure 4.1.

The practical task of long document classification [372] is largely underexplored due to computational challenges and the difficulty of efficiently representing large multimodal inputs. Additionally, the proximity to applications involves a larger community for conducting research, yet innovations may happen in isolation or be kept back as intellectual property, lacking evaluation on public benchmarks [147, 148], consequently hindering reproducibility and fair comparisons.

Existing DC methodology is limited to single-page images and independently and identically distributed (i.i.d.) settings. We propose an improved methodology that extends its scope to multipage images and non-i.i.d. settings. We also reflect on evaluation practices and put forward more mature evaluation protocols. To better capture the complexity of real-world document handling, we align DC benchmarking closer to practical applications and task formulations.

¹ PDF is the 2nd most popular file format on the web (after HTML and XHTML), following detected MIME types in CommonCrawl.

Our key contributions can be summarized as follows:
• We have redesigned and formalized multipage DC scenarios to align fragmented definitions and practices.
• We construct and share two novel datasets, RVL-CDIP_MP (huggingface.co/datasets/bdpc/rvl_cdip_mp) and RVL-CDIP-N_MP (huggingface.co/datasets/bdpc/rvl_cdip_n_mp), with the community for evaluating multipage DC.
• We conduct a comprehensive analysis of the novel datasets with different experimental strategies, observing promise from a best-case analysis (+6% absolute accuracy) of targeting multipage document representations and inference.
• We overview challenges stalling DC progress, giving concrete guidelines to improve and increase dataset construction efforts.

4.2 Problem Formulation

We propose formal definitions to better align DC research with real-world document distributions and practices. This will help standardize DC practices and make it easier to compare different methods.

Let X denote a space of documents, and let Y denote the output space as a finite set of discrete labels. Document page classification is a prototypical instance of classification [472], where the goal is to learn an estimator f: X → Y using N supervised input-output pairs (X, Y) ∈ X × Y drawn i.i.d. from an unknown joint distribution P(X, Y).

A page p is a natural classification input consisting of an image v ∈ R^(Q×H×W) (number of channels, height, and width, respectively) with T word tokens {t_i}_{i=1}^T organized according to a layout structure {(x_i^1, y_i^1, x_i^2, y_i^2)}_{i=1}^T, typically referred to as bounding boxes, either coming from Optical Character Recognition (OCR) or natively encoded.

Note that in practical business settings, VRDs are presented at inference time to a production VDU system in different forms:

I. Single page (often scanned or photographed)
II. Single document
III. Multiple documents
IV. Multiple pages (often bulk-scanned to a single PDF)
V. Single image with multiple localized pages
Classification tasks. In a unification attempt, we formalize the different classification inputs and tasks that arise in practical scenarios, as visualized in Figure 4.1.

Definition 9 [Page Classification]. (I) A page (as defined above) is categorized with a single category. When only considering the visual modality, the literature refers to this as 'document image classification' [165]. An estimator for page classification, with the input dimensionality (X_p) relative to a page (viz., number of channels, height, and width), is defined as:

f_p: X_p → Y, where Y = [C] for C mutually exclusive categories. (4.1)

Definition 10 [Document Classification]. (II) A document d contains a fixed number of L ∈ [1, ∞) pages, which do not necessarily have the same dimensions (height and width). Albeit a design choice, the input dimensionality is normalized across pages (e.g., 3 × 224 × 224). Assuming a fixed input dimensionality (X_d) relative to a document (L × Q × H × W), a document classifier is defined as:

f_d: X_d → Y, where Y = [K] for K mutually exclusive categories. (4.2)

Note also the difference in label space between the two previous classification tasks, which can have some overlap for document types that are uniquely identifiable from a single page (e.g., an accident statement form).

Definition 11 [Document Bundle Classification]. (III) A bundle b can contain a variable number of B documents, each with a potentially different number of L pages. A bundle classifier models a sequence classification problem over multiple documents:

f_b: X_b → Y, where Y is a product space of B documents, Y = Y_1 × ... × Y_B, with {Y_j = [K] : j ∈ [B]}. (4.3)

Definition 12 [Document Stream Classification]. (IV) A page stream s is similar to a document in terms of input (number of pages L), albeit typically more varied in content and page formats. Page streams can implicitly contain many different documents, with pages not necessarily contiguous or even in the right order, as illustrated in Figure 4.1.

f_s: X_d → Y, where Y is a product space of L pages, Y = Y_1 × ... × Y_L, with {Y_j = [C] : j ∈ [L]}. (4.4)

A very concrete example of how the label sets [C] and [K] can differ is a loan application use case where national registry proofs need to be sent: if two pages are sent with the front and back of the ID card, f_s requires two labels (id_front, id_back), whereas f_d requires a single document label (id_card).
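To make these signatures concrete, the following is a minimal typing sketch in Python; the type aliases are hypothetical names for illustration only and are not part of the benchmark code.

```python
from typing import Any, Callable, List, Sequence

Page = Any          # stand-in for a (Q, H, W) page image with its OCR tokens
PageLabel = str     # labels from [C], e.g. "id_front"
DocLabel = str      # labels from [K], e.g. "id_card"

FP = Callable[[Page], PageLabel]                            # f_p, Definition 9
FD = Callable[[Sequence[Page]], DocLabel]                   # f_d, Definition 10
FB = Callable[[Sequence[Sequence[Page]]], List[DocLabel]]   # f_b, Definition 11
FS = Callable[[Sequence[Page]], List[PageLabel]]            # f_s, Definition 12
```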
A critical note is due to differentiate page stream segmentation (PSS) [128, 328, 494] from page stream classification as defined above (f_s). PSS treats a page stream as a binary classification task to identify document boundaries, without classifying the identified documents afterward. f_s considers the task in one stage, where [C] is constructed either to send atomic units, such as the wage slip in Figure 4.1, for individual downstream processing, or to be combined into a single document label from [K] based on the assigned page labels. Two-stage processing is possible by applying PSS as an instance of an f_s classifier with [C] = {0, 1}, where 1 indicates a document boundary, followed by f_d.

Definition 13 [Page Splitting]. (V) A multipage image m contains multiple page objects of similar types, which can have multiple orientations, page dimensions, and often physical overlap from poor scanning [132]. A standard example involves multiple receipts to be analyzed for reclaiming VAT. While a complete approach would consist of localizing pages (using edge/corner detection, object detection, or instance segmentation) and identifying page types, we only focus on the latter. For instance, multipage splitting can be defined as a preliminary check on how many page types are present in a multipage image (with input dimensionality similar to a single page p):

f_m: X_p → Y, where Y = Z^C. (4.5)

Payment proofs such as tickets and receipts are more often packed together due to their compactly printed sizes, which requires splitting the unique documents from within a page to send them individually for further processing. Following the national registry example, another rare yet "economical" variation for f_d occurs when a single page contains both the front and back of the ID card stitched together. These edge cases (rightmost example in Figure 4.1) should be dealt with on a case-by-case basis when setting up [K] (e.g., a specific label: multi-tickets).

The formalisms defined above establish a taxonomy of DC tasks, which will be revisited in the discussion of challenges to align DC research and applications (Section 4.5).
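As an illustration of the two-stage option mentioned above (PSS followed by f_d), consider this minimal sketch; the function names are hypothetical and both classifiers are assumed to be given.

```python
from typing import Any, Callable, List, Sequence

Page = Any  # stand-in for a page image

def two_stage_stream_classification(
    stream: Sequence[Page],
    pss: Callable[[Page], int],            # 1 = page starts a new document
    f_d: Callable[[Sequence[Page]], str],  # document classifier over [K]
) -> List[str]:
    """Split a page stream at predicted boundaries, then label each segment."""
    segments: List[List[Page]] = []
    for page in stream:
        if pss(page) == 1 or not segments:  # boundary (or the very first page)
            segments.append([])
        segments[-1].append(page)
        # note: an imperfect PSS propagates its errors to the f_d stage
    return [f_d(segment) for segment in segments]
```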
4.3 Balancing Research & Applications

Having established a taxonomy, we further sketch the role of DC in the larger scope of VDU, in both the applications and research contexts. We point to related VDU benchmarks and describe current DC datasets with their relevant (or missing) properties using the task formalizations. Next, we link to related initiatives in dataset construction and calls for reflection on DU practices. Finally, we introduce the curated DC datasets to support multipage DC (f_d) benchmarking, which will be used in a further experimental study.

General Benchmarking in VDU: In any industrial application context where information transfer and inbound communication services are an important part of the day-to-day processes, a vast number of documents have to be processed. To provide customers with the expected service levels (in terms of speed, convenience, and correctness), a lot of time and resources are spent on categorizing these documents and extracting crucial information. Complex business use cases (such as consumer lending, insurance claims, real estate purchases, and expenditure) involve processing bundles of different documents that clients send via any communication channel. For example, obtaining a loan typically entails sending the following documents to prove solvency: a number of monthly pay stubs, bank statements, tax forms, and national registry proofs. Furthermore, not all documents are born-digital (BD), and as an artifact of the communication channel (bulk scans/photographs, digitization of physical mail), a single client communication can contain an arbitrary number of document page images in an unknown order, requiring an f_s classifier. Figure 4.1 provides an overview of the different DC tasks that arise in application scenarios, which are scarcely covered by DC research benchmarks (see Table 4.2). As RVL-CDIP is the only large-scale non-synthetic DC benchmark, we discuss it in more detail; other dataset descriptions can be found in the Supplementary material.

Current state-of-the-art DU research approaches [15, 187, 259] leverage the "pretrain and fine-tune" procedure, which performs significantly well on popular DU benchmarks [165, 188, 197, 544] (see Table 4.1). However, their performance drops significantly when exposed to real-world business use cases, mainly for the following reasons: (1) The models are limited to modeling page-level context due to heavy compute requirements (e.g., the quadratic complexity of self-attention [473]), effectively treating each document page as conditionally independent and potentially missing out on essential classification cues. (2) The methods are heavily reliant on the quality of OCR engines to extract spatially local information (i.e., mostly at word level) suitable for solving downstream benchmark tasks, but fail to generalize well on business documents. (3) Existing datasets used for pretraining [165, 252] are different in terms of domain, content, and visual appearance from many downstream DC tasks (detailed in Section 4.5.3). Therefore, it can be challenging for industry practitioners to choose a specific model to fine-tune for the DC use cases and task specifics that they commonly encounter.

Dataset | Size | Data Source | Domain | Task | OCR | Layout
IIT-CDIP [252] | 35.5M | UCSF-IDL | Industry | Pretrain | ✗ | ✗
RVL-CDIP [165] | 400K | UCSF-IDL | Industry | DC | ✗ | ✗
RVL-CDIP-N [241] | 1K | Document Cloud | Industry | DC | ✗ | ✗
TAB [328] | 44.8K | UCSF-IDL | Industry | DC | ✗ | ✗
FUNSD [197] | 199 | UCSF-IDL | Industry | KIE | ✓ | ✗
SP-DocVQA [308] | 12K | UCSF-IDL | Industry | QA | ✓ | ✗
OCR-IDL [40] | 26M | UCSF-IDL | Industry | Pretrain | ✓ | ✗
FinTabNet [543] | 89.7K | Annual Reports S&P | Finance | TSR | ✗ | ✓
Kleister-NDA [432] | 3.2K | EDGAR | US NDAs | KIE | ✓ | ✗
Kleister-Charity [432] | 61.6K | UK Charity Commission | Legal | KIE | ✓ | ✗
DeepForm [435] | 20K | FCC Inspection | Forms broadcast | KIE | ✓ | ✗
TAT-QA [550] | 2.8K | Open WorldBank | Finance | QA | ✓ | ✗
PubLayNet [544] | 360K | PubMed Central | Scientific | DLA | ✗ | ✓
DocBank [261] | 500K | arXiv | Scientific | DLA | ✓ | ✓
PubTabNet [545] | 568K | PubMed Central | Scientific | TSR | ✗ | ✓
DUDE [468] | 40K | Mixed | Multi-domain | QA | ✓ | ✗
Docile [422] | 106K | EDGAR & synthetic | Industry | KIE | ✓ | ✗
CC-PDF [460] | 1.1M | Common-Crawl (2010-22) | Multi-domain | Pretrain | ✗ | ✗

Table 4.1. DU benchmarks with their significant data sources and properties.
Acronyms for tasks: DC: Document Classification; DLA: Document Layout Analysis; KIE: Key Information Extraction; QA: Question Answering; TSR: Table Structure Recognition.

Dataset | Purpose | #d | #p | |Y| | Language | Color depth
NIST [98] | f_s | - | 5590 | 20 | English | Grayscale
MARG [290] | f_s | - | 1553 | 2 | English | RGB
Tobacco-800 [553] | f_s | - | 800 | 2 | English | Grayscale
TAB [328] | f_s | - | 44.8K | 2 | English | Grayscale
Tobacco-3482 [232] | f_p | - | 3482 | 10 | English | Grayscale
RVL-CDIP [165] | pretraining, f_p | - | 400K | 16 | English | Grayscale
RVL-CDIP-N [241] | f_p, OOD | - | 1002 | 16 | English | RGB
RVL-CDIP-O [241] | f_p, OOD | - | 3415 | 1 | English/Mixed | RGB
RVL-CDIP_MP | f_d | ±400K | E[L] = 5 | 16 | English | Grayscale
RVL-CDIP-N_MP | f_d, OOD | 1002 | E[L] = 10 | 16 | English | RGB

Table 4.2. Statistical comparison of public and proposed extended multipage DC datasets. OOD refers to out-of-distribution detection. #d and #p refer to the number of documents or pages, respectively. For the novel MP datasets, we report the average number of pages E[L].

RVL-CDIP. The Ryerson Vision Lab Complex Document Information Processing [165] dataset used the original IIT-CDIP (The Illinois Institute of Technology dataset for Complex Document Information Processing) [252] metadata to create a new dataset for document classification. It was created as the equivalent of ImageNet in the VDU field, which invited a lot of multi-community (Computer Vision, NLP) efforts to solve this dataset. It consists of low-resolution, scanned documents belonging to one of 16 classes such as letter, form, email, invoice.

Proposed Datasets. RVL-CDIP_MP is our first contribution, retrieving the original documents of the IIT-CDIP test collection which were used to create RVL-CDIP. Some PDFs or encoded images were corrupt, which explains why we have around 500 fewer instances. By leveraging metadata from OCR-IDL [40], we matched the original identifiers from IIT-CDIP and retrieved the documents from IDL using a conversion. However, the same caveats as for RVL-CDIP apply. RVL-CDIP-N_MP can serve its original goal as a covariate shift test set, now for multipage document classification. We were able to retrieve the original full documents from DocumentCloud and Web Search. As no existing large-scale datasets include granular page-level labeling (in terms of [C]) for multipage documents, we could not create a benchmark for evaluating f_s. Appendix B points to visualizations from the proposed datasets.
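Both datasets are hosted on the Hugging Face Hub (see the contribution list in Section 4.2). A minimal loading sketch could look as follows; the split and feature names are assumptions, and the actual loader may differ.

```python
from datasets import load_dataset

# Hedged sketch: load the proposed multipage test sets from the Hub.
mp = load_dataset("bdpc/rvl_cdip_mp", split="test")
mp_n = load_dataset("bdpc/rvl_cdip_n_mp", split="test")
print(mp[0].keys())  # expected: page images plus a document-level label
```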
Related Initiatives. General benchmarking challenges have driven the VDU research community to set the seed for initiatives to create its own document-oriented "ImageNet" [399] challenge, over which multiple long-term grand challenges can be defined (deepdoc2022, scaldoc2023). In another task paradigm, DocVQA, there have been efforts in the same spirit to redirect focus to multipage documents [451, 467]. For the task of KIE, [424] launched a similar call for practical document benchmarks closer to real-world applications. While these initiatives point in a similar future direction, our contribution goes beyond introducing novel datasets and seeks to guide the complete methodology of DC benchmarking.

4.4 Experimental Study

To classify a multipage document, one might ask: "Why not just predict based on the first page? What would be the gain of processing all pages? What baseline inference strategies can be applied to classify a multipage document?". This prompted us to put these assumptions to the test in a small motivating study (code provided at https://huggingface.co/bdpc/src).

As current public datasets only support page classification, we have extended some existing DC datasets to enable testing a slightly more realistic, yet more complex, document classification scenario (f_d). We have reconstructed the original PDF data of the DC datasets in Section 4.3. The goal of this experiment is to tease out some issues and strategies when naively scaling beyond page-level DC. Our baseline of choice is the document foundation model DiT-Base [259], which as a visual-only f_p is competitive with more compute-intensive multimodal, OCR-based pipelines [15, 187, 443].

Inference | Strategy | Scope
sample | first | page
sample | second | page
sample | last | page
sequence | max confidence | page
sequence | soft voting | page
sequence | hard voting | page
grid | grid | document
document | (not tested) | document

Table 4.3. Tested inference methods to classify multipage documents and simulate a true document classifier f_d. Scope refers to the independence assumption taken at inference time.

Table 4.3 overviews some straightforward inference strategies. The simplest inference strategy is to sample a given page with index l ∈ [L] (or in our case {1, 2, L − 1}) and predict ŷ^l = [f_p(x)]^l. The sequence strategies mainly differ in how the final prediction ŷ is obtained from the predictions per page, assuming a probabilistic classifier f̃_p: X_p → [0, 1]^K:

MaxConf(x, y) = argmax_{l ∈ [L], k ∈ [K]} [f̃_p(x, y)]^l_k (4.6)

SoftVote(x, y) = argmax_{k ∈ [K]} Σ_{l=1}^{L} [f̃_p(x, y)]^l_k (4.7)

HardVote(x, y) = argmax_{k ∈ [K]} Σ_{l=1}^{L} (e_{ŷ^l})_k, (4.8)

with e a one-hot vector of size K.
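A minimal NumPy sketch of these three aggregation rules, assuming a precomputed (L × K) matrix of per-page class probabilities, is given below; it illustrates Eqs. (4.6)-(4.8) and is not the benchmark implementation.

```python
import numpy as np

def aggregate_page_predictions(probs: np.ndarray, strategy: str) -> int:
    """Combine per-page probabilities `probs` of shape (L, K) into one label."""
    if strategy == "max_confidence":      # Eq. (4.6): most confident page wins
        _, k = np.unravel_index(np.argmax(probs), probs.shape)
        return int(k)
    if strategy == "soft_vote":           # Eq. (4.7): sum probabilities over pages
        return int(probs.sum(axis=0).argmax())
    if strategy == "hard_vote":           # Eq. (4.8): count per-page argmax votes
        votes = np.bincount(probs.argmax(axis=1), minlength=probs.shape[1])
        return int(votes.argmax())
    raise ValueError(f"unknown strategy: {strategy}")
```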
The grid strategy is intuitive: we tile all page images in an equal-sized grid, trading off resolution to jointly consume all document pages. While the results for this strategy in our experiment, with a fairly low grid resolution (224 × 224), are poor, variations (with aspect-preserving [247] or layout-density-based scaling) deserve further exploration.

Following similar calls in the VDU literature [468] to establish calibration and confidence ranking as default evaluation metrics, we include Expected Calibration Error (ECE) [156, 332, 340] to evaluate top-1 prediction miscalibration and the Area-Under-Risk-Coverage-Curve (AURC) [138, 193] to measure selective accuracy over test set coverage (cf. Section 2.2.3).

Strategy | Acc↑ | F1↑ | F1_M↑ | ECE↓ | AURC↓
f_p$ [259] | 93.345 | 93.351 | 93.335 | 0.075 | 0.010
first | 91.291 | 91.286 | 91.271 | 0.073 | 0.014
second | 87.295 | 87.305 | 87.277 | 0.070 | 0.029
last | 85.091 | 85.060 | 85.028 | 0.072 | 0.038
MaxConf | 91.407 | 91.453 | 91.344 | 0.124 | 0.006
SoftVote | 91.220 | 91.185 | 91.236 | 0.134 | 0.004
HardVote | 85.995 | 86.182 | 85.781 | 0.085 | 0.018
grid | 72.642 | 72.045 | 73.266 | 0.109 | 0.042

Table 4.4. Base classification performance of DiT-base [259] (finetuned on RVL-CDIP), evaluated on the test set of RVL-CDIP_MP per baseline f_d strategy. Best results per metric are boldfaced. $ refers to our reproduction of results.

Strategy | Acc↑ | F1↑ | F1_M↑ | ECE↓ | AURC↓
f_p [241] | 78.643 | 81.947 | 60.564 | 0.105 | 0.076
first | 78.760 | 75.316 | 60.801 | 0.144 | 0.025
second | 64.939 | 58.741 | 50.773 | 0.132 | 0.071
last | 64.228 | 58.192 | 48.859 | 0.128 | 0.074
MaxConf | 76.321 | 72.855 | 57.470 | 0.180 | 0.042
SoftVote | 73.984 | 69.163 | 56.486 | 0.183 | 0.039
HardVote | 67.480 | 63.188 | 52.235 | 0.110 | 0.088
grid | 47.755 | 40.645 | 38.584 | 0.102 | 0.170

Table 4.5. Base classification performance of DiT-base [259] (finetuned on RVL-CDIP), evaluated on the test set of RVL-CDIP-N_MP per baseline f_d strategy. Best results per metric are boldfaced.

Results in Tables 4.4 and 4.5 demonstrate that classifying by only the first page is a solid strategy, with performance dropping when considering only later pages. Maximum confidence and soft voting require L (pages) times more processing, yet attain performance similar to the best single-page prediction. However, this could be attributed to two factors: i) dataset creation bias, since [165] constructed RVL-CDIP from one page of each original .tiff file, for which the label was kept if it belonged to one of the 16 categories, whereas RVL-CDIP-N [241] consistently chose the first page; ii) documents are fashioned in a summary-detail or top-down content structure over pages. To confirm the validity of the latter hypothesis, more robust experiments on more fine-grained labeled DC data are needed.

The results in Table 4.4 and Table 4.5 can be interpreted as an upper bound (i.i.d.) and a loose lower bound (non-i.i.d., yet related), respectively. For the former, MaxConf is the most accurate, yet compared to SoftVote has worse AURC, potentially making SoftVote a better candidate for industry use where controlled risk is valued more. While this trend is not reproduced on RVL-CDIP-N_MP, this can be explained by its more consistent first-page labeling, with later pages adding distracting classification cues.
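For concreteness, minimal sketches of both evaluation metrics are given below; equal-width binning for ECE is one common choice among several, and this is not the exact benchmark implementation.

```python
import numpy as np

def ece(conf: np.ndarray, correct: np.ndarray, n_bins: int = 10) -> float:
    """Expected Calibration Error over equal-width confidence bins."""
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    total, n = 0.0, len(conf)
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (conf > lo) & (conf <= hi)  # top-1 confidences are > 1/K > 0
        if mask.any():
            gap = abs(conf[mask].mean() - correct[mask].mean())
            total += mask.sum() / n * gap
    return float(total)

def aurc(conf: np.ndarray, correct: np.ndarray) -> float:
    """Area under the risk-coverage curve: average selective error rate
    when predictions are admitted in order of decreasing confidence."""
    order = np.argsort(-conf)
    err = 1.0 - correct[order].astype(float)
    risk = np.cumsum(err) / np.arange(1, len(err) + 1)
    return float(risk.mean())
```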
Dataset | Strategy (∗) | Acc↑ | ∆
RVL-CDIP_MP | first+second | 93.795 | 2.504
RVL-CDIP_MP | first+last | 93.675 | 2.384
RVL-CDIP_MP | second+last | 89.709 | −1.583
RVL-CDIP_MP | first+second/last | 94.454 | 3.163
RVL-CDIP-N_MP | first+second | 83.638 | 4.878
RVL-CDIP-N_MP | first+last | 83.130 | 4.370
RVL-CDIP-N_MP | second+last | 71.545 | −7.215
RVL-CDIP-N_MP | first+second/last | 84.553 | 5.793

Table 4.6. Best-case classification accuracy (indicated with (∗)) when combining 'knowledge' over different pages. ∆ refers to the absolute difference with the first-page-only strategy.

To answer what can be gained from processing a multipage document in a single shot, Table 4.6 reports a best-case error analysis, where a page prediction is counted as correct if the model would have had access to the other pages. This is calculated using a bit-wise OR operation between the one-hot vectors (I[y == ŷ]) expressing correctness for each strategy model. As a proof of concept, this shows that targeting multipage document representations and inference is a promising avenue to improve DC.
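The best-case combination above amounts to a one-line operation over per-document correctness indicators; a minimal sketch with hypothetical variable names:

```python
import numpy as np

def best_case_accuracy(correct_a: np.ndarray, correct_b: np.ndarray) -> float:
    """Best-case accuracy of two page-level strategies: a document counts
    as correct if either strategy classified it correctly (bit-wise OR
    over the 0/1 indicators I[y == y_hat])."""
    return float(np.mean(correct_a | correct_b))

# e.g. combining the 'first' and 'second' page strategies:
# acc = best_case_accuracy(first_correct, second_correct)
```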
4.5 Challenges and Guidelines

Following the introduced task formalizations of Section 4.2, we claim that the distribution on which document classification is currently evaluated publicly and the real-world distribution have heavily diverged. Additionally, our experimental validation on the novel datasets demonstrated the potential of multipage DC, empirically reinforcing our call to action on improving DC methodologies. Let P^A(X, Y) and P^R(X, Y) denote those two distinct distributions, of real-world applications and research, respectively. In the following, we characterize the specific divergences with concrete examples and suggestions for better alignment.

4.5.1 Divergence of Tasks: f

The challenge of directly processing multipage documents is typically avoided by current DC models, which only support single-page images [15, 153, 187, 216, 247, 263, 371, 443]. Whenever a new DU model innovation happens, its impact on document classification is publicly measured only on the first task scenario (e.g., f_p on RVL-CDIP), whereas production DU systems more often need to deal with the other settings (II, III, IV, V) in Figure 4.1. Moving beyond the limited page-image context will test models' ability to sieve through potentially redundant and noisy signals, as the classification can depend on very local cues, such as a single title on the first page or the presence of signatures on the last page. Without any datasets to test this ability, we also cannot blindly assume that we can simply scale f_p classifiers to take in more context, or that aggregating isolated predictions over single pages is a future-proof (performant and efficient) strategy, as our experiments have shown.

While p is a natural processing unit for humans, acquiring supervised annotations for every single page can be more expensive than attaching a single content-based label (from [K]) to a multipage document. However, fine-grained labeling with f_s could allow for more targeted and constrained KIE, as knowing that a certain page l has label y^l = id_front ∈ [C] allows focusing on specific entities such as the national registry number and date/place of birth. Ultimately, these classification task formulations can also help one consider how to set up f directly and how to annotate document inputs, depending on the DC use case.

4.5.2 Divergence of Label Space: Y

Current benchmarks often use simplified label sets that are difficult to reconcile with industry requirements. While RVL-CDIP is the de facto standard for measuring performance on f_p DC, recent research [242] has revealed several undesirable characteristics. It supports only 16 labels that pertain to a limited yet generic subset of business documents, which is far from the 1K classes of ImageNet, in whose image it was modeled. Real-world DC use cases typically support a richer number of classes (K ∼ 50-400). RVL-CDIP suffers from substantial label noise, estimated to be higher than current state-of-the-art f_p error rates (see [242] for a detailed analysis), which are thus overfit to noise. Due to the absence of the original labeling guidelines, the labels in RVL-CDIP can be ambiguous, containing disparate subtypes (e.g., business cards in the resume category) and inconsistencies between classes (cheques present in both the budget and invoice categories). Other errors include (near-)duplicates causing substantial overlap between train and test distributions, corrupt documents, and plain mislabeling. However, many common CV benchmarks are plagued by similar issues [31] and would benefit from relabeling campaigns [519] to maintain their relevance.

Considering the above, multi-label classification (not covered explicitly in Section 4.5.1) could be a solution to resolve label ambiguities, yet this requires absolute consistency in label assignments, which, when lacking, introduces even more label noise. The highest labeling quality could arise from consistent labeling at the page level and hierarchically aggregating page labels (C → K), yet granular annotations are more expensive to obtain (see the sketch after this subsection). Alternatively, it may be better to follow the mutually exclusive and collectively exhaustive (MECE) principle [72] to construct label sets at the document level.

Finally, an overlooked aspect of current benchmarks is that label sets [K] can be constructed based on some business logic, where a very local cue can lead to a class assignment, such as a checked box on page 26. Admittedly, this conflates the tasks of document object detection, KIE, and DC within a single label set. However, the current focus on classes with plenty of evidence across a document, with more global classification cues, should be balanced with document types that rely on local cues.

Taking the above issues into account, the community should work together towards developing more effective and realistic DC datasets that better align with the needs of industry practitioners. While tackling the challenge of Y divergence was out of scope for the contributed datasets, the next subsection gives systematic recommendations for obtaining better future DC benchmarks.
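As referenced above, hierarchical aggregation of page labels [C] into a document label [K] could be as simple as a lookup plus a consistency check; a minimal sketch with a hypothetical mapping:

```python
# Hypothetical page-to-document label mapping (C -> K), for illustration.
PAGE_TO_DOC = {
    "id_front": "id_card",
    "id_back": "id_card",
    "wage_slip": "payslip",
}

def aggregate(page_labels: list[str]) -> str:
    """Aggregate per-page labels into one document label from [K]."""
    doc_labels = {PAGE_TO_DOC[label] for label in page_labels}
    if len(doc_labels) != 1:
        # inconsistent page labels would introduce label noise if forced
        raise ValueError(f"ambiguous document label: {sorted(doc_labels)}")
    return doc_labels.pop()
```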
Figure 4.2. Divergence of input data. The first image is an example from the DC benchmark RVL-CDIP [165], the second one from Docile [422] for KIE, while the third one comes from Info-VQA [310], illustrating the visual-layout richness of modern VRDs versus the monotonicity of most DC document data.

4.5.3 Divergence of Input Data: X

We offer suggestions for future benchmark construction efforts such that they take into account properties that are currently unaccounted for, organically improving on our first pursuit towards multipage DC benchmarking.

We argue that current VDU benchmarks fail to account for many real-world document data complexities: multiple pages, the distinction between born-digital and (mobile-)scanned documents, and differences in quality, orientation, and resolution. Additionally, the UCSF Industry Documents Library (and in consequence all DC datasets drawn from this source) contains mostly old (estimated period 1950s to 2002), type-written, black-and-white documents, while in reality, modern documents can have multiple channels, colors, and (embedded) fonts varying in size, typeface, and typography. Recently, there have been efforts to collect more modern VRD benchmarks for tasks such as DocVQA [310, 468], KIE [422], and DLA [362]. Modern VRDs contain visual artifacts such as logos, checkboxes, barcodes, and QR codes, and geometric elements such as rectangles, arrows, charts, and diagrams, all of which are not frequently encountered with the same variety in current benchmarks. Future DC benchmarks should incorporate modern VRDs to bring more diversity and variability into the input data.

When developing DU models, it is therefore important to consider the role of vision, language, and layout, and how these are connected to the classification task. For example, current datasets are based on tobacco industry documents containing very domain-specific language, to which a less robust classifier can overfit (e.g., the spurious cue that a particular cigarette brand indicates an invoice). We highlight that document data can be multi-lingual, and code-switching is fairly common in document-based communications. For instance, an email may be in one language while the attachment is in another.
In summary, future benchmarks must contain multipage, multi-type, multi-industry (e.g., retail vs. medical invoice), multi-lingual documents with a wide range of document data complexities to build and test generic DC systems. The community should explore potential solutions to the lack of adequate datasets for testing DC models, such as i) leveraging public document collections, ii) synthetic generation, and iii) anonymization.

Public document collections: There are increasingly more (non-profit) organizations (e.g., DocumentCloud), governments (SEC EDGAR), financial institutions (World Bank Documents & Reports), and charities (Guidestar) that make business-related documents publicly available for transparency in their operations and for archival/research purposes. These collections provide datasets that are closer to real-world scenarios. However, these documents are typically unlabelled, although annotations could be crowd-sourced through combined funding from interested parties. Since most document data sources restrict automated crawling or document scraping, future dataset construction will require some cooperation and creativity, whilst fulfilling licensing, ethical, and legal requirements. A specifically highlighted initiative is CC-PDF [460], which collected modern, multi-lingual VRDs from CommonCrawl for future use.

Data synthesis: This alternative was suggested by prior work on KIE [30, 424] and DLA [37] for generating business and scientific documents. [422] followed up on this, delivering a large-scale KIE dataset with 6K annotated real documents and 100K synthetic examples. However, it can be challenging for synthetic generation to simulate real-world documents with similar data and classification complexity.

Anonymization can be a viable option to construct a DC dataset without compromising ethical guidelines and privacy regulations. This process involves removing, masking, replacing, or obfuscating data so that document content can no longer be attributed to an individual or entity. For example, one should remove names, addresses, and identifying information such as social security numbers, or replace them with a textual tag ([social-security-number]) or a similar surrogate pattern (e.g., generated with Faker), as sketched below. While this process is not viable for creating KIE datasets, KIE can play a big role in semi-automatically anonymizing documents [143, 366]. Companies may be hesitant to make document collections public due to concerns about privacy, confidentiality, and GDPR compliance. While anonymization can be an effective method, it should be approached with caution, as potential risks of re-identification can make someone with originally good intentions legally liable. A potential side-step is investing in privacy-preserving federated learning (e.g., PFL-DocVQA) to allow access to private industry document data.
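A minimal sketch of the tag-or-surrogate replacement idea, using the Faker library; the regex is a toy pattern for US social security numbers, and real pipelines would locate entities with KIE models instead.

```python
import re
from faker import Faker

fake = Faker()
SSN = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")  # toy PII detector

def anonymize(text: str, surrogate: bool = False) -> str:
    """Replace detected SSNs with a tag or a fake but plausible value."""
    if surrogate:
        return SSN.sub(lambda m: fake.ssn(), text)
    return SSN.sub("[social-security-number]", text)
```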
4.5.4 Maturity of Evaluation Methodology

Most DC models are evaluated using predictive performance metrics such as accuracy, precision-recall, and F1-score on i.i.d. test sets. However, in user-facing applications, calibration can be as important as accuracy [156, 332, 340], even more so when the confidence estimate of a DC model is used to triage predictions to either an automated flow or manual processing by a human. Once a DC model is in production, the i.i.d. assumption will start to break, which recommends a priori testing of robustness against various sources of noise (OCR, subtle template changes, wording or language variations, ...) and expected distribution shifts (born-digital vs. scanning artifacts, shifting page order, page copies, irrelevant or out-of-scope documents, novel document classes, concept drift, ...).

Nevertheless, we observe only a few applications in DC (only reported on f_p) of more mature evaluation protocols [193] beyond predictive performance. Notable exceptions include covariate shift detection from document image augmentations [304], sub-class shift and generalization [241, RVL-CDIP-N], out-of-distribution detection [241, RVL-CDIP-O], and cross-domain generalization [23, (RVL-CDIP ↔ Tobacco-3482)]. However, the results on the latter can be misleading, as both datasets are drawn from a similar source distribution. Another gap in DC benchmarking concerns evaluating selective classification [138, 193], which is closer to the production-value question of how many documents can be automated without any human assistance.

Another interesting evaluation protocol concerns out-of-the-box performance, or how data-hungry/sample-efficient a certain model is. In practice, few-shot learning from minimal annotations is a highly valued skill. This few-shot evaluation protocol has been applied in [402] with different data regimes. Finally, inference complexity (time-memory) has been brought back to attention by OCR-free models [216], and we believe it will be key to measure when scaling solutions to multipage documents.

4.6 Chapter Conclusion

Our work represents a pivotal step forward in establishing multipage DC by proposing a comprehensive benchmarking and evaluation methodology. Thereby, we have addressed longstanding challenges and limitations (Section 4.5) that have hindered progress in the field. As motivated by our experimental study, we have demonstrated the need to advance multipage document representations and inference.

Following up on this, we provide recommendations for future DC dataset construction efforts, pertaining to the type and nature of document data and to the variety and quality of the classification label set, with a focus on particular DC scenarios closer to applications, and finally to how future progress should be measured. We are hopeful that the VDU community can come together on these shortcomings and apply the lessons from this reality check. Extending the applicability of current state-of-the-art VDU models to multipage documents needs further exploration, which will go hand in hand with benchmark creation initiatives or with incorporating multiple DC task annotation layers on a single dataset.
Chapter 5

Document UnderstanDing of Everything (DUDE)

The contents of this chapter come from two publications [468, 469]:

Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, Dawid Jurkiewicz, Rafał Powalski, Paweł Józiak, Sanket Biswas, Mickaël Coustaty, and Tomasz Stanisławek. ICDAR 2023 Competition on Document UnderstanDing of Everything (DUDE). In International Conference on Document Analysis and Recognition, pages 420–434. Springer, 2023

Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, Pawel Joziak, Rafal Powalski, Dawid Jurkiewicz, Mickaël Coustaty, Bertrand Anckaert, Ernest Valveny, Matthew Blaschko, Marie-Francine Moens, and Tomasz Stanisławek. Document Understanding Dataset and Evaluation (DUDE). In Proceedings of the IEEE/CVF International Conference on Computer Vision, pages 19528–19540, 2023

The first publication, on the Document UnderstanDing of Everything (DUDE) competition, was selected for oral presentation at ICDAR 2023. The second publication, on the DUDE dataset and benchmark, was featured as a poster presentation at ICCV 2023.

This multi-party collaboration (6 universities and 3 companies) with many brilliant researchers involved the creation of a new dataset and benchmark, the organization of a competition, and the publication of the results. For clarity, we will refer to the DUDE competition as the ICDAR 2023 competition, and to the DUDE dataset and benchmark as the ICCV publication.

Author declarations: https://drive.google.com/file/d/1AmSxTOLk1Lo61sgWLd5FN5OMNQEgam_v

In short, I conceptualized the project, was responsible for the dataset creation, annotation, and benchmarking (encoder-only models, T5, HiVT5), designed evaluation and confidence estimation, and wrote the majority of the ICDAR and ICCV papers.

The dataset is available at: https://huggingface.co/datasets/jordyvl/DUDE_loader. Benchmark code is available at: https://github.com/rubenpt91/MP-DocVQA-Framework. The competition remains open for submissions at: https://rrc.cvc.uab.es/?ch=23.

Document UnderstanDing of Everything (DUDE) is a concept rooted in both machine learning and philosophy, seeking to expand the boundaries of document AI systems by creating highly challenging datasets that encompass a diverse range of topics, disciplines, and complexities.
Inspired by the philosophical 'Theory of Everything', which aims to provide a comprehensive explanation of the nature of reality, DUDE endeavors to stimulate the development of AI models that can effectively comprehend, analyze, and respond to any question on any complex visually-rich document (VRD).

Incorporating philosophical perspectives into DUDE enriches the approach by engaging with fundamental questions about knowledge, understanding, and the nature of documents. By addressing these dimensions, researchers can develop AI systems that not only exhibit advanced problem-solving skills but also demonstrate a deeper understanding of the context, nuances, and implications of the information they process.

This chapter presents the Document UnderstanDing of Everything (DUDE) dataset, benchmark, and competition. It is presented in a similar form as the ICCV publication, extended with the results of the ICDAR competition. In line with the standpoint of the previous chapter, we call on the Document AI (DocAI) community to re-evaluate current methodologies and embrace the challenge of creating more practically-oriented benchmarks. This project aims to remediate the halted research progress in understanding visually-rich documents (VRDs). We present a new dataset with novelties related to the types of questions, answers, and document layouts, based on multi-industry, multi-domain, and multipage VRDs of various origins and dates.

Moreover, we push the boundaries of current methods by creating multi-task and multi-domain evaluation setups that more accurately simulate real-world situations where powerful generalization and adaptation under low-resource settings are desired. DUDE aims to set a new standard as a more practical, long-standing benchmark for the community, and we hope that it will lead to future extensions and contributions that address real-world challenges.

Additionally, we present the results of the DUDE competition and discuss the innovations demonstrated by participants. The competition was structured as a single task with a multi-phased evaluation protocol that assesses the few-shot capabilities of models by testing generalization to previously unseen questions and domains, a condition essential to the business use cases prevailing in the field. Under the newly studied settings, current SOTA models show a significant performance gap, even when improving visual evidence and handling multipage documents. We conclude that the DUDE dataset proposed in this competition will be an essential, long-standing benchmark to further explore for achieving improved generalization and adaptation under low-resource fine-tuning, as desired in the real world. To sum up, our work illustrates the importance of finding more efficient ways to model language, images, and layout in DocAI.
Figure 5.1. QA as a natural language interface to multipage VRDs. Example question-answer pairs with their type tags:
[non-answerable] Q: In which year does the Net Requirement exceed 25,000? A: None
[abstractive, counting] Q: How many attorneys are listed for the plaintiffs? A: Two
[layout-navigating, graphic-intensive] Q: Are the margins of the page uniform on all pages? A: Yes
[multi-hop, layout-navigating] Q: From the list of Top 10 Key Recovery Components, which is the last component listed on the second page? A: Hope
[extractive, list] Q: What are the Years mentioned in Chart 1? A: [2020, 2021, 2022]
[abstractive, graphic-intensive] Q: Does this document contain any checkboxes? A: No

5.1 Introduction

Early stages of research and growth in any field are characterized by enacting proofs-of-concept and demonstrating the feasibility of the proposed solution. In the Deep Learning era, this is often echoed by building narrow and simplified datasets that do not reflect real-world complexity, leading to models that may not be suitable for practical use.

The field of Document Understanding (DU) is no exception to the recent proliferation of deep architectures, which in this case are predominantly used for classification and information extraction from documents. However, the wide and complex nature of documents presents many challenges that remain unsolved or not yet addressed. One such challenge is domain generalization: a model trained on medical documents may not be directly applicable to financial or tabular content. Another challenge concerns task-agnostic architectures, where a model must be able to adapt to various DU subtasks such as document classification, key information extraction (KIE), and question answering (QA). Lastly, the high variability of document contents and layouts often leads to highly imbalanced samples within document types, resulting in a long-tailed distribution with few or almost no samples to train a model on.

Despite the importance of these challenges, there is currently no DU benchmark dataset that simultaneously addresses all of these issues. This paper proposes a novel dataset, formulated as an instance of Document Visual Question Answering (DocVQA), to evaluate how well current DU solutions deal with multipage documents, whether they can navigate and reason over visual layouts, and whether they can generalize their skills to different document types and domains.

The data collection and evaluation design of DUDE naturally motivates targeting models that can answer natural yet highly diverse questions (e.g., regarding document elements, their properties, and compositions) for any VRD (e.g., drawn from potentially unseen distributions of layouts, domains, and types). The presented problem setting relates to Multi-Domain Long-Tailed Recognition (MDLT) [507], which concerns learning from multi-domain imbalanced data whilst addressing label imbalance, divergent label distributions across domains, and possible train-test domain shift. Put plainly, since we cannot provide ground-truth QA pairs for, e.g., stamps, on every document type (domain), we expect a solution to transfer the subtask 'stamp detection', learned on document types where stamps naturally occur (and thus where training QA pairs were created organically), to other domains.
The DocVQA and MDLT formulations of DUDE allow us to create a long-standing, challenging benchmark that can in the future be easily extended with more subtasks formulated as QA pairs and with domains relating to document types (see Limitations).

The contribution of this work is twofold. First, we have created DUDE, a novel large-scale, multipage, multi-domain, multi-industry DocVQA benchmark for evaluating DU progress. Second, we show that the zero-shot and fine-tuned performance of current SOTA models applied to DU lags far behind human baselines, explained in part by the need for more holistic and efficient modeling of language, vision, and richly structured layouts.

5.2 Related Work

Document Understanding encompasses datasets related to various subtasks like document layout analysis [261, 544], classification [165], key information extraction [197, 432], table extraction [427, 543, 545], and visual question answering [308, 315, 450]. These benchmarks lead to end-to-end DU architectures that have transformed common DocAI practices [15, 134, 153, 187, 263, 365, 371]. These task-specific benchmarks, however, are often tailored to a single domain, limiting the ability to create and assess how well DU models generalize to other document types and domains. To fill this gap, we adopt a visual question answering (VQA) approach, which has been crucial in the growth of the DU field.

The VQA paradigm provides a natural language interface for various tasks from both computer vision and natural language processing. In the latter, the question-answering approach has been successfully used in several domains, including medicine [202, 209, 257, 318, 338, 352, 384], open-domain knowledge [281, 291, 313, 506], emotions [41, 155], code [7, 278], logical reasoning [282, 504, 516, 534], claim verification [185, 446, 523], and math [10, 65, 182, 316, 529]. As a result of its ability to function as a natural language interface for various forms of data, this paradigm has been applied to other domains as well. For example, the question-answering approach has been combined with modalities such as images [13, 38, 39, 161, 353, 513], speech [237, 514], knowledge graphs [106, 206, 408, 429, 457], videos [58, 59, 74, 158, 249], and maps [60, 359].

Overall, the convergence of computer vision and NLP through the emergence of VQA tasks has also opened up new avenues for research in the DU field, with many DU datasets now including rich visual content alongside questions. Yet, prior work on document VQA has mainly focused on single-page documents [308, 310, 449], with rare exceptions such as MP-DocVQA [451]. Moreover, [308, 449] pose only extractive questions, where the answer appears verbatim in the context on which the question is defined, as in other question answering benchmarks [235, 386, 456]. These datasets also do not contain non-answerable questions, unlike established (natural language) QA datasets such as [235, 387]. To the best of our knowledge, there are no VQA datasets containing questions that require lists as answers; only a few text-only QA datasets contain such answer types [83, 256, 357].
Other datasets related to ours are rather domain-specific [310, 375, 440, 441, 551]. We give a detailed comparison of the most related document VQA datasets in Table 5.1, highlighting our major contributions.

5.3 DUDE Dataset

While DUDE shares some similarities with existing VQA datasets, a closer comparison (see Table 5.1) highlights its unique features. We are confident that proficiency in the areas introduced in this work will showcase a model's capability to handle the intricacy and diversity of document understanding tasks in real-world scenarios.

Documents. The dataset covers a wide range of document types, sources, and dates, as shown in Table 5.1 and Figure 5.2, where its diverse nature is confirmed by the spread of document content representations. Moreover, it covers a broad range of domains, including medical, legal, technical, and financial, among others, to evaluate models' ability to handle diverse topics and the specific knowledge each requires. Furthermore, the dataset contains documents with varying layouts: diverse text arrangements, font sizes, and styles, to ensure that models can handle visually diverse documents.

[Figure 5.2 is a t-SNE projection of 5,641 documents, with one marker type per dataset: DocVQA, InfographicsVQA, Ours, TAT-DQA, VQA-CD, VisualMRC.]

Figure 5.2. Visualization of inter-document similarities between samples from different datasets (t-SNE over TF-IDF representations of 1k passages from each source).

In contrast to our proposal, current VQA datasets often focus on homogeneous documents, such as invoices in VQA-CD [302] or financial reports in TAT-DQA [551]. Even when not restricted to a single domain or layout, these datasets share essential characteristics. For example, InfographicsVQA [310] demonstrates significant diversity in topics and designs, but still embodies a preference for visual aids over complex tables or long text passages. Moreover, VQA datasets are commonly restricted to either born-digital or scanned documents, which limits their ability to measure robustness to the mixed-origin files that one usually finds in real-world applications. In particular, this restriction makes it uncertain whether state-of-the-art performers on website fragments from VisualMRC [440] can be efficient on multi-column layouts and on documents with OCR errors or incorrectly detected reading orders. Finally, a typical dataset for document VQA contains documents from a limited period, i.e., a few years (Table 5.1).

Considering the properties mentioned above, the most diverse dataset to date is Single Page DocVQA (SP-DocVQA) [308], which contains mixed-origin documents of different types created over several decades. However, it is built exclusively on single-page document excerpts and is limited to the several domains represented in the Industry Documents Library. As a result, it complements rather than serves as a touchstone for general-purpose DU systems. MP-DocVQA [451] extends it by including the previous and posterior pages of the documents.
However, the questions are kept the same, which makes the extra pages mere distractors.

Questions. We use VQA as a natural language interface to VRDs, challenging the DU model with diverse questions, advanced operations, and multi-step reasoning to achieve real-world success.

Firstly, we assert that various layouts and visual elements must be comprehended semantically. As such, we introduce complex questions targeting these document elements, requiring comprehension beyond the document content, such as 'how many text columns are there?', 'does the document contain words with diacritics?' or 'which page contains the largest table in the document?'. These layout-navigating questions bridge the gap between the Document Layout Analysis and Question Answering paradigms.

Our unique and detailed compositional questions demand a model that comprehends semantics and generalizes to new questions in a zero-shot setting. For example, >90% of our questions are unique, while we target questions whose answer scope is much more diverse than in previous works.(1) Since neural networks are known to perform poorly at mathematical reasoning and symbolic processing, we provide training and evaluation questions demanding arithmetic and comparison operations on numbers and dates.

(1) An answer type comparison is included in the supplementary materials.

Moreover, we feature multi-hop questions that probe a model's robustness to sequential reasoning and mimic how humans ask questions. They may be useful in real-world tasks, e.g., 'If the checkbox on page 1 section 3a indicates that the company is incorporated, how much yearly revenue did it generate in 2022 (given the table on page 5)?'

Answers. Even though some VQA datasets are deliberately limited to questions of exclusively extractive (SP-DocVQA) or abstractive (VisualMRC) nature, others do not obey such restrictions and include both question types (see Table 5.1). The dataset we provide includes both abstractive and extractive answers, covering various types such as textual, numerical, dates, yes/no, lists, or no answer.

This allows us to cover all possible business use cases and reveal major deficiencies of existing DU systems beyond typical textual answers. For instance, no existing VQA dataset includes both non-answerable questions and questions answered with a list. Consequently, the models considered to date tend to make unreliable guesses on questions with an answer not entailed by the content [387]. Our dataset is designed to cover answers beyond plain extractive text, such as a list of items or even 'None'.

The 'None' answer type demands that the model correctly identify that an answer cannot be provided because the question is ill-posed, e.g., it asks about the value of an empty cell in a table.
In addition, list-generation problems pose challenges to the model, as (1) more tokens need to be generated, (2) they may be sourced from different places in the document, and (3) the OCR reading order may influence the element ordering.

5.3.1 Gathering Documents

A fundamental difficulty in gathering raw source files was ensuring dataset diversity while fulfilling strict licensing requirements. Therefore, rather than depending on the initial sources of files, e.g., the libraries that originally published digitized materials, we resorted to aggregate websites.

The document collection process was manual and consisted of formulating queries to archive.org (containing 36M books and texts), commons.wikimedia.org (with 86M media files of various types), and documentcloud.org (with around 5M public documents). The queries consisted of keywords relevant to some category of interest; e.g., the resume category of our proposal consists of the keywords 'resume', 'cv', 'curriculum', and 'biography'. Where necessary, a separate query parameter ensured that the resulting files belonged to the public domain or were released under a permissive license. Information on keywords and the search procedure is distributed as part of the DUDE dataset.

From the resulting documents, we selected those representing the requested category and visually distinctive from the ones already gathered. Special care was put into removing examples that visibly expose controversial content or may be subject to privacy or legal concerns, despite the declared license. We collected five thousand, typically multipage, English documents using this methodology.

5.3.2 Annotation Process

The annotation process involved in-house annotators and Amazon Mechanical Turk freelancers. For the latter, there is limited control over expertise, and where justified, we resorted to limiting task availability depending on the number of completed tasks and the historical acceptance rate.(2) The former are five highly qualified people with a Ph.D. in Linguistics. These three annotation scenarios will be referred to as All MTurkers, Best MTurkers, and Qualified Linguists. We estimate the total cost of annotation involving both Linguists and MTurkers at $20,000.

(2) Approval above 97% over at least 5k HITs.

Phase 1. We started by providing All MTurkers with the documents described in Section 5.3.1 in separate batches aimed at collecting abstractive, extractive, and list QA pairs. Each freelancer was asked to propose up to five questions of a particular type and, in the case of extractive ones, to provide an evidence bounding box. The exception to this process is the annotation of non-answerable questions, previously shown to be particularly challenging [387]. These are predominantly annotated by Qualified Linguists and, because of their quality, promoted without passing through Phases 2-3.

Candidate QA pairs are semi-automatically filtered to exclude annotations that cannot be valid due to their length, the use of non-typical character combinations, or type-specific criteria, such as non-list answers in list batches.
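For illustration, a minimal sketch of such semi-automatic filters follows; the concrete rules, thresholds, and the list-answer delimiter below are hypothetical, not the ones actually used for DUDE:

    import re

    def plausible_qa(question: str, answer: str, batch_type: str) -> bool:
        """Hypothetical Phase-1 validity filter: length limits, non-typical
        character combinations, and type-specific criteria."""
        if not (3 <= len(question.split()) <= 60):       # length criterion (illustrative)
            return False
        if re.search(r"(.)\1{4,}", answer):              # non-typical runs, e.g. '!!!!!'
            return False
        if batch_type == "list" and "|" not in answer:   # assume '|' delimits list items
            return False
        if batch_type == "extractive" and not answer.strip():
            return False
        return True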
Additionally, we cluster duplicate and near-duplicate question-answer pairs to ensure dataset diversity and promote them directly to Phase 3 after a manual review (the same QA pair provided independently by several annotators indicates its validity).

Phase 2. The rest of the annotations promoted from Phase 1 were directed to All MTurkers, but this time, instead of providing complete QA pairs, they were asked to answer the question from the previous round. The obtained triples of a question and two answer variants (one from each phase) were evaluated using inter-answer ANLS (defined in Section 5.3.5) and promoted to the final dataset if the agreement was >0.8. Otherwise, QA triples were directed to Phase 3.

Phase 3. Best MTurkers were provided with the document, question, and answer variants to decide the correctness of each answer and optionally overrule both variants if neither is correct. Outliers from decisions in this phase, such as overruling without a judgment on the previous answers, were reviewed by Qualified Linguists and corrected if needed.

Optional Phase 4. Annotations of the test set were reviewed by Qualified Linguists. Given the data from Phase 3, they corrected questions and answers and created metadata related to the diagnostic categories described in Section 5.3.4.

5.3.3 Dataset Statistics

Property                  | Ours               | SP-DocVQA     | VisualMRC    | InfographicsVQA | TAT-DQA
Dataset-level properties
Sources                   | Multi              | Industry docs | Web pages    | Infographics    | Finance reports
Origin                    | BD, Scan           | Mostly scans  | BD           | BD              | BD
Period                    | 1860-2022          | 1960-2000     | Jan-Mar 2020 | not specified   | 2018-2020
Documents                 | 5,019              | 12,767        | 10,234       | 5,485           | 2,758
Pages (avg±std)           | 5.72±6.4           | 1.0±0.0       | 1.0±0.0      | 1.0±0.0         | 1.11±0.32
Tokens (avg±std)          | 1,831.53±2,545.06  | 183±149.96    | 154.19±79.34 | 287.98±214.57   | 576.99±290.12
Simpson coeff. (ResNet)   | 0.82               | 0.76          | 0.83         | 0.86            | 0.73
Simpson coeff. (Tf-Idf)   | 0.95               | 0.93          | 0.99         | 0.94            | 0.15
Question-level properties
Questions                 | 41,541             | 50,000        | 30,562       | 30,035          | 16,558
Unique (%)                | 90.9               | 72.34         | 96.26        | 99.11           | 95.65
Length (avg±std)          | 8.65±3.35          | 8.34±3.04     | 9.38±4.01    | 11.57±3.71      | 12.51±4.18
Semantics                 | All                | T, L, F, Ch   | T, L, F, Ch  | T, L, F, Ch, M  | T, L
Answer-level properties
Unique (%)                | 70.7               | 64.29         | 91.82        | 48.84           | 77.54
Length (avg±std)          | 3.35±6.1           | 2.11±1.67     | 8.38±6.36    | 1.66±1.43       | 3.44±7.20
Extractive (%)            | 42.39              | 100.0         | 0.0          | 71.96           | 55.72
Abstractive (%)           | 38.25              | 0.0           | 100.0        | 24.91           | 44.28
List (%)                  | 6.62               | 0.0           | 0.0          | 5.69            | 0.0
None (%)                  | 12.74              | 0.0           | 0.0          | 0.0             | 0.0

Table 5.1. Summary of the existing English document datasets and our challenge. BD stands for born-digital. Layout semantics are abbreviated as (T)able, (L)ist, (F)igure, (Ch)art, and (M)ap. Comparison based on Azure Cognitive Services (3.2) OCR.

We conducted a statistical analysis of our dataset and found that the distributions of document length, question length, and answer type are much more diverse than in other datasets in the same domain. We also used the Simpson diversity coefficient [421] for the analysis and summarize the results in Table 5.1.
The following are the counts for the data split:

            | train  | val   | test (diagnostic)
documents   | 3,010  | 749   | 1,215 (530)
questions   | 23,728 | 6,315 | 11,448 (2,462)

Table 5.2. Data split counts.

The distribution of the number of tokens per document is much more diverse than in other datasets, a consequence of the more diverse distribution of pages (see Figure 5.4). Note that some of the documents are more visual than textual (or even visual-only), making the left whisker essentially reach 0 (log2 scaling of the x-axis).

Figure 5.3. Distribution of the number of tokens in documents, answers, and questions.

Figure 5.4. While other datasets are predominantly single-page only, the number of pages featured in DUDE is more diverse, yet still biased towards shorter documents.

The distribution of the number of tokens in answers is heavy-tailed; to some extent, this is also a property of the distribution of the number of tokens in questions. Furthermore, 90.9% of questions are unique, and so are 70.7% of answers (taking answer variants into account).

We scrutinized the answer types by aggregating possible answers into classes representing the information they convey. The study used heuristics to determine whether the answers fit into a NER labeling scheme [20], into categories we anticipated, such as yes/no and none, or into ones we did not anticipate, such as color. This resulted in 25 different groups of answers, with the 'other' answer type being the fourth-largest group. Cramer's V coefficient was used to check for correlations between question types and answer types, and the results indicated that there were few correlations. The expected correlations, such as 'none' answers with non-answerable questions or yes/no answers with abstractive questions, were present, but barely any correlation was significant. This suggests it is hard to guess the answer based on the question alone.
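For reference, Cramer's V can be computed from the question-type by answer-type contingency table; a minimal sketch using SciPy's chi-squared test (the toy table below is for illustration only, not DUDE data):

    import numpy as np
    from scipy.stats import chi2_contingency

    def cramers_v(contingency: np.ndarray) -> float:
        """Cramer's V for an r x c contingency table."""
        chi2, _, _, _ = chi2_contingency(contingency)
        n = contingency.sum()
        r, c = contingency.shape
        return float(np.sqrt(chi2 / (n * (min(r, c) - 1))))

    # Toy example: rows = question types, columns = answer types.
    table = np.array([[120,  5,  3],
                      [ 10, 80,  6],
                      [  8,  7, 90]])
    print(cramers_v(table))  # a value near 1 would mean answers are guessable from questions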
We also study a relative diversity measure called the Simpson coefficient [421, 546]. To define it, consider a fixed distance function d(a_1, a_2) defined for pairs of documents a_1, a_2 ∈ A (the dataset); in our applications, it is the cosine distance between document embeddings. Further, for an arbitrary number of datasets A_1, ..., A_N, the diversity of A_1 with respect to A_2, ..., A_N is defined as

    Div_{A_2,...,A_N}(A_1) = 1 − p( d(a_1^1, a_2^1) < min_{i=2:N} d(a_1^i, a_2^i) ),

where a_1^i, a_2^i ∈ A_i are randomly selected pairs (i = 1..N). We report the relative diversities of each of the datasets, relative to the other datasets in the study, based on two embeddings: visual (ResNet-101 embeddings) and semantic (Tf-Idf embeddings), in Table 5.1. The results show that the probability that two random documents from DUDE are more similar than a random pair of documents from any other dataset is small, meaning that documents in our dataset are well-distributed and diverse.
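A Monte Carlo estimator of this measure is straightforward; the sketch below (our own, assuming each dataset is given as a matrix of document embeddings, one row per document) samples random pairs and counts how often the within-A_1 pair is the closest:

    import numpy as np

    def cosine_distance(u, v):
        return 1.0 - (u @ v) / (np.linalg.norm(u) * np.linalg.norm(v))

    def simpson_diversity(A1, others, n_trials=10_000, seed=0):
        """Estimate Div_{A2..AN}(A1): 1 minus the probability that a random
        within-A1 pair is closer than every random within-Ai pair (i >= 2)."""
        rng = np.random.default_rng(seed)
        hits = 0
        for _ in range(n_trials):
            a, b = A1[rng.choice(len(A1), size=2, replace=False)]
            d_self = cosine_distance(a, b)
            d_other = min(
                cosine_distance(*Ai[rng.choice(len(Ai), size=2, replace=False)])
                for Ai in others
            )
            hits += d_self < d_other
        return 1.0 - hits / n_trials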
5.3.4 Diagnostic Subsets

Following previous DU datasets, we gather diagnostic metadata for close to half of the documents and QA pairs in the test set (see Figure 5.5). These are intended to enable a fine-grained analysis of the models' performance. The taxonomy used is an extension of the one from earlier works [47, 308, 310]; it covers DUDE-specific questions and enables a more detailed examination of the visual artifacts under consideration.

[Figure 5.5 is a bar chart of category counts, grouped into Complexity (simple; complex: layout, multi-hop, other), Evidence (handwriting, layout, plain, table or list; visual: chart, checkbox, color, image, logo, map, stamp, other), Form (date, numeric, proper name, other), Operation (arithmetic, comparison, counting, normalization), and Type (abstractive, extractive).]

Figure 5.5. Count of particular diagnostic categories in a subset of 2.5k test set QA pairs annotated in detail to help analyze models' performance.

Question type and perceived complexity. We distinguish questions perceived as simple, i.e., those based on spotting a value near a phrase mentioned explicitly as part of the question. For example, 'Who is the Secretary of the U.S. Department of Commerce?' when the document contains 'Penny Pritzker, Secretary, U.S. Department of Commerce.' Such a question could be guessed given an approximate string matching algorithm and does not require much comprehension beyond that. The remaining questions are marked as hard, with distinguished categories of hard multi-hop questions and hard meta/layout-navigating questions.

Answer evidence. We provide information on what types of elements have to be comprehended to provide an answer, including free text, handwriting, table or list, and layout, i.e., non-tabular spatial understanding of text placement. These follow the ontology established by previous works [47, 308, 310]. In addition, we supply hints on the graphical artifacts one needs to consider for particular questions, such as image/photo, plot/chart, checkbox, and annotation.

Required operation. We distinguish arithmetic, comparison, counting, and normalization operations to indicate the need to perform, respectively, arithmetic operations on extractable data, comparison of numerical values or sizes, counting of elements, or conversion of data present in the document to another format (e.g., rounding or date format conversion).

Answer form/shape. Finally, we provide information on the shallow form of the returned answer, including date, numeric, and proper name.

5.3.5 Evaluation

The evaluation process follows the typical paradigm of separate training, validation, and test splits. We provide both a standalone evaluator and a website(3) [467] to submit test set predictions.

(3) rrc.cvc.uab.es/?ch=23

To assess models' performance, we rely on the ANLS metric introduced by the authors of the ST-VQA dataset [39]. Roughly speaking, it is a generalization of accuracy that does not penalize the system for an answer whose similarity to the gold standard, measured with normalized Levenshtein similarity, is above a specified threshold. Moreover, the metric assumes the presence of multiple, equally valid reference answers. These properties account for possible OCR errors and different phrasings, such as the same numerical answer represented as 'two' and '2' by different annotators.

In practice, production DU systems provide an estimate of confidence in order to triage documents that do not need to be manually reviewed by a human. While the reliability of the automation ability of a DU solution is deemed quintessential for generating business value in practice [48], DU research rarely reports any confidence evaluation. Some exceptions exist in closely related task domains like scene text recognition [425] and QA [208, 531]. With DUDE, we want to establish calibration evaluation and confidence ranking as a default evaluation methodology in DU, especially since the field is so close to applications. To this end, we report (next to ANLS) two additional metrics: the Expected Calibration Error (ECE) [156, 332, 340] and the Area Under the Risk-Coverage Curve (AURC) [138, 193].

Calibration requires that the probability a model assigns to its predictions equals their true likelihood of being correct [86, 88, 520]. ECE approximates the top-1 calibration error by a weighted average over the accuracy/confidence difference of histogram bins. In our evaluation setting, we consider a predicted answer correct if its ANLS to the ground truth answer is above a pre-defined threshold (τ=0.5). For consistency, non-answerable and list answers both have confidence estimated for the answer as a whole (regardless of the number of answers). Following [342], we apply equal-size binning (with 100 bins, Lp = 1), avoiding some pathologies of equal-range binning [231, 463].

AURC is a selective classification metric that evaluates how well an estimator prevents silent failures on an i.i.d. test set. As an aggregate measure of estimator performance (ANLS) and confidence ranking, it provides a more practically useful estimate of overall performance when the estimator can abstain from (low-confidence) decisions and defer to a human for feedback.

By reporting the above metrics, we hope that future work will contribute methods (e.g., calibration methods for improved forecasting or metrics for better predictive uncertainty evaluation) that concretely target the empirical observations of overconfidence/miscalibration in DU models.
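To make these metrics concrete, here is a minimal sketch (our own illustration; the official scorer is the standalone evaluator mentioned above): ANLS with τ=0.5 against multiple references, equal-mass-binned ECE, and AURC from per-question confidences and risks (e.g., risk = 1 when an answer's ANLS falls below τ, else 0):

    import numpy as np

    def levenshtein(a: str, b: str) -> int:
        prev = list(range(len(b) + 1))  # classic dynamic-programming edit distance
        for i, ca in enumerate(a, 1):
            curr = [i]
            for j, cb in enumerate(b, 1):
                curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
            prev = curr
        return prev[-1]

    def anls(prediction: str, references: list[str], tau: float = 0.5) -> float:
        """1 - normalized Levenshtein distance to the closest reference;
        similarities below the threshold are zeroed out."""
        best = 0.0
        for ref in references:
            p, r = prediction.strip().lower(), ref.strip().lower()
            nl = levenshtein(p, r) / max(len(p), len(r), 1)
            best = max(best, 1.0 - nl if nl < tau else 0.0)
        return best

    # Reproduces the human-baseline examples from Section 5.5:
    # anls("Eagle", ["an eagle"]) = 0.625, anls("62%", ["62"]) ~= 0.67,
    # anls("1958-04-29", ["4-29-58"]) = 0.

    def ece(confidences, correct, n_bins: int = 100) -> float:
        """Expected Calibration Error with equal-mass bins (L1)."""
        conf = np.asarray(confidences, float)
        corr = np.asarray(correct, float)
        order = np.argsort(conf)
        total = 0.0
        for idx in np.array_split(order, n_bins):
            if idx.size:
                total += idx.size / conf.size * abs(corr[idx].mean() - conf[idx].mean())
        return total

    def aurc(confidences, risks) -> float:
        """Area under the risk-coverage curve: mean cumulative risk when
        predictions are admitted in order of decreasing confidence."""
        order = np.argsort(-np.asarray(confidences, float))
        r = np.asarray(risks, float)[order]
        return float((np.cumsum(r) / np.arange(1, r.size + 1)).mean())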
5.4 DUDE Competition

Over the past few years, the field of Document Analysis and Recognition (DAR) has embraced multimodality, with contributions from both NLP and CV. This has given rise to DU as the all-encompassing solution [15, 187, 371] for handling VRDs, where layout and visual information are decisive in understanding a document. This umbrella term subsumes multiple subtasks, ranging from KIE [197, 432] and DLA [544] to VQA [310, 450] and table recognition [201, 376]. For each of these subtasks, influential challenges have been proposed, e.g., the ICDAR 2019 Scene Text VQA [38, 39] and ICDAR 2021 Document VQA (DocVQA) [308, 450] challenges, which in turn have generated novel ideas that have impacted the new wave of architectures currently transforming the DAR field. Nevertheless, we argue that the DAR community must embrace the future challenges (multi-domain, multi-task, multipage, low-resource settings) that naturally juxtapose the previous competitions with the pragmatic feedback attained via business-driven applications.

5.4.1 Challenge Objectives

We aim to support the emergence of models with strong multi-domain layout reasoning abilities by adopting a diversified setting where multiple document types with different properties are present. Moreover, a low-resource setting (in the number of samples) is assumed for every domain provided, which, formulated as a DocVQA competition, allows us to measure progress with regard to the desired generalization (Section 5.4.3.1). Additionally, we strive for the development of confidence estimation methods that can not only improve predictive performance but also adjust the calibration of model outputs, leading to more practical and reliable DU solutions. We believe that DUDE's emphasis on task adaptation and on the capability of handling a wide range of document types, layouts, and complexities will encourage researchers to push the boundaries of current DU techniques, fostering innovation in areas such as multimodal learning, transfer learning, and zero-shot generalization.

5.4.2 Challenge Contributions

DUDE answers the call for measuring improvements closer to the real-world applicability of DU models. By design of the dataset and competition, participants were forced to make novel contributions in order to make a significant impact on the DU task. Competitors showcased intriguing model extensions, such as combining models that learn strong document representations with the strengths of recent large language or vision-language models (ChatGPT [52] and BLIP2 [258, 260]) to better understand questions and extract information from a document context more effectively. HiVT5+modules extended Hi-VT5 [451] with token/object embeddings for various DU subtasks, while MMT5 employed a two-stage pretraining process and multiple objectives to enhance performance. These innovative extensions highlight the ingenuity in addressing the complex challenges of document understanding.
5.4.3 Motivation and Scope

We posit that progress in DU is determined not only by improvements in each of its predecessor fields (CV, NLP) but even more by factors connected to document intelligence as it is explicitly understood in business settings. To improve the real-world applicability of DU models, one must consider (i) the availability and variety of document types in a dataset, as well as (ii) the problem-framing methods.

Currently, publicly available datasets avoid multipage documents, are not concerned with multi-task settings, and do not provide multi-domain documents of sufficiently different types. These limitations hinder real-world DU systems, given the ever-increasing number of document types occurring in various business scenarios. This problem is often bypassed by building systems on private datasets, which leads to a situation where datasets cannot be shared, documents of interest are not covered in benchmarks, and published methods cannot be compared objectively. DUDE counters these limitations by explicitly incorporating a large variety of multipage documents and document types.

Furthermore, the adaptability of DU to the real world is slowed down by the low-resource setting: only a limited number of training examples can be provided, involving unpleasant manual labor and, subsequently, costly model development. Any time a new dataset is produced in a scientific or commercial context, a new model must be specifically designed and trained on it to achieve satisfactory performance. At the same time, transfer learning is the most promising route to rapid model improvements, while zero- and few-shot performance still needs to be addressed in evaluation benchmarks.

Bearing in mind the characteristics outlined above, we formulated the DUDE dataset as an instance of DocVQA to evaluate how well current solutions can simultaneously handle the complexity and variety of real-world documents and all the subtasks that can be expected. Optimally, a DU model should understand layout in a way that allows for zero-shot performance through attaining 'desired generalization', i.e., generalization to any documents (e.g., drawn from previously unseen distributions of layouts, domains, and types) and any questions (e.g., regarding document elements, their properties, and compositions). Therefore, we incorporated these criteria while designing our dataset, which may stand as a common starting point and a cooperative path toward progress in this emerging area.

5.4.3.1 Desired Generalization

The challenge presented by DUDE is an instance of a Multi-Domain Long-Tailed Recognition (MDLT) problem [507].

Definition 14 (Multi-Domain Long-Tailed Recognition). MDLT focuses on learning from multi-domain imbalanced data whilst addressing label imbalance, divergent label distributions across domains, and potential train-test domain shift. This framework naturally motivates targeting estimators that generalize to all domain-label pairs.

A domain D = {(x_i, y_i)}_{i=1}^{N} is composed of data sampled from a distribution P_XY, where X denotes an input space (documents) and Y the output space (QA pairs).
Each x ∈ X represents a document, forming a tuple (v, l, t) that expresses a complex composition of visual, layout, and textual elements. For simplicity, consider that each 'label' y ∈ Y represents a question-answer pair, relating to an implicit task to be completed (such as date KIE in 'What is the document date?'). Due to the potentially compositional nature of QA, the label distribution is evidently long-tailed. During training, we are given M domains (document types) on which we expect a solution to generalize (Figure 5.6), both within domains (different numbers of samples for each unique task) and across domains (even without examples of a task in a given domain).

What sets domains apart is any difference in their joint distributions, P_XY^j ≠ P_XY^k. For example, an invoice is less similar (in terms of language use, visual appearance, and layout) to a contract than to a receipt or credit note. Yet, a credit note naturally contains a stamp stating information such as 'invoice paid', whereas receipts rarely contain stamps. This might require a system to transfer 'stamp detection' learned within another domain, say on notary deeds.

Figure 5.6. Illustration of MDLT as applicable to the DUDE problem setting. The y-axis aggregates skills related to specific KIE or reasoning tasks over document elements (checkbox, signature, logo, footnote, ...). The x-axis denotes the obtained samples (QA pairs) per task. Each domain has a different label distribution P(Y), typically relating to within-domain document properties P(X). This training data exhibits label distribution shifts across domains, often requiring zero-shot generalization (marked red).

Notably, it will be 'organic' to obtain more examples of certain questions (tasks) in a given domain. This should also encourage models to learn a certain skill in the domains where they have more training examples. Put plainly, it is better to learn checkbox detection on contracts than on invoices, which rarely contain any. This MDLT framework allows us to create a lasting, challenging benchmark that can be easily extended in the future with more tasks (formulated as QA pairs) and domains (relating to document types). In the first iteration of the DUDE competition, we have targeted specific skills by guiding annotators with focused instructions, which we share for future extensions.

5.4.4 DUDE Competition Protocol

The ICDAR 2023 competition on Document UnderstanDing of Everything took place from February to May of 2023. A training-validation set with 30k QA annotations on 3.7k documents was given to participants at the beginning of February. The 11.4k test set questions on 1.2k documents (see Table 5.2) were only made accessible during a window between March and May.
Participants were asked to submit results obtained on the public, blind test set documents rather than deliver model executables, although they were encouraged to open-source their implementations. We relied on the scientific integrity of the participants to adhere to the competition's guidelines specified on the Robust Reading Competition (RRC) portal.(4)

(4) https://rrc.cvc.uab.es/?ch=23

5.4.4.1 Task Formulation

Given an input consisting of a PDF with multiple pages and a natural language question, the objective is to provide a natural language answer together with an assessment of the answer confidence (a float value scaled between 0 and 1). Each unique document is annotated with multiple questions of different types, including extractive, abstractive, list, and non-answerable. Annotated QA pairs are not restricted to the answer being explicitly present in the document. Instead, any question on the aspect, form, or visual/layout appearance of the document under review is allowed.

Additionally, competitors were allowed to submit results for only a specific answer type (provided in the annotations) so that, for example, encoder-only architectures could compete in DUDE on extractive questions. Another important subtask is to obtain a calibrated and selective DocVQA system, which lowers its answer confidence when unsure about its answers and does not hallucinate in the case of non-answerable questions. Regardless of the number of answers (zero in the case of non-answerable questions or multiple for list questions), we expect a single confidence estimate for the whole answer to guarantee consistency in the calibration evaluation. To promote fair competition, we provided three OCR versions of each document, obtained from one open-source engine (Tesseract) and two commercial engines (Azure, AWS).

5.4.4.2 Evaluation Protocol

The first evaluation phase assumes only independently and identically distributed (i.i.d.) data containing a similar mixture of document and question-answer types across the train-validation-test splits. The same evaluation metrics as for the benchmark apply to this phase.

The (implicit) second evaluation phase created a mixture of seen and unseen domain test data. This was launched jointly with the first evaluation phase, as otherwise one would be able to detect the novel unseen-domain test samples. To score how gracefully a system deals with unseen-domain data, the evaluation metric is AUROC [270], which roughly corresponds to the probability that a positive example (in-domain) is assigned a higher detection score than a negative example (out-of-domain). A system is expected to either lower its confidence or abstain from giving an answer.

There is a strict difference between a non-answerable question and an unseen-domain question. For the former, the document is from a domain that was included during training, yet the question cannot be solved with the document content, e.g., asking about who signed the document without any signatures present. For the latter, the question is apt for the document content, yet the document is from a domain that was not included during training and validation, which we would expect the system to pick up on.
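Under the assumption that the detection score is simply the model's answer confidence, this AUROC can be estimated directly as the rank statistic it corresponds to; a minimal sketch (our own):

    import numpy as np

    def auroc(scores_in, scores_out) -> float:
        """Probability that a random in-domain example receives a higher
        detection score than a random out-of-domain one (ties count half)."""
        s_in = np.asarray(scores_in, float)[:, None]
        s_out = np.asarray(scores_out, float)[None, :]
        return float((s_in > s_out).mean() + 0.5 * (s_in == s_out).mean())

    # e.g. auroc([0.9, 0.8, 0.7], [0.6, 0.75]) -> 5/6 ~= 0.83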
All metric implementations and evaluation scripts are made available as a standalone repository to allow participants to evaluate close to the official blind test evaluations.(5)

(5) https://github.com/Jordy-VL/DUDEeval

All submitted predictions are automatically evaluated, and the competition site provides ranking tables and visualization tools newly adapted to PDF inputs to examine the results. After the formal competition period, it will serve as an open archive of results. The main competition winner is decided based on the aggregate high scores for ANLS, AURC, and AUROC.

5.5 DUDE Benchmark

5.5.1 Baselines

Human performance. To establish the human baseline, we assigned test set questions to Qualified Linguists, ensuring none of them faced the same documents they reviewed in Phase 4. The procedure results in an estimate of 74.76 ANLS points (Table 5.3). At first glance, this result seems low. Still, when analyzing results case by case, it turns out that it is hard to score much better, since the answer format can strongly influence the overall results: 'Eagle' vs. 'an eagle' (0.625 ANLS), '62%' vs. '62' (0.67 ANLS), '1958-04-29' vs. '4-29-58' (0 ANLS), 'Clemson University, Clemson South Carolina' vs. 'Clemson University' (0 ANLS). Humans achieved the lowest performance (67.58) on the extractive question type, which confirms our hypothesis, since abstractive answers are shorter (mostly numbers, yes/no, or colors).

We analyzed the maximum score achieved by the best-performing model for each diagnostic test category and plotted it against human performance in Figure 5.7.

Figure 5.7. We report the average ANLS for the human expert vs. the best-performing model per diagnostic category as a ceiling analysis.

Reference models. We assessed a group of models to determine how their performance is influenced by different factors, such as (1) their ability to handle textual, layout, and visual elements, (2) whether they were fine-tuned for the task, (3) their size (in trainable parameters), and (4) the maximum input length they can handle.

To analyze factors (1) and (2), we conducted a zero-shot evaluation of several baseline text-only models. We used three encoder-based models (BERT [94], Longformer [28], and BigBird [521]) that cannot generate text and three that feature a decoder (T5 [383], GPT-3-Davinci [52], and ChatGPT) and thus have this capability. Next, we extended the T5 architecture with 2D layout embeddings [47, 371] and fine-tuned models with increasing maximum sequence lengths (512 → 8192) on DUDE. Finally, we evaluated our replication of the hierarchical Hi-VT5 model [451], as this model has the ability to decode text, understand multipage layouts, and comprehend visual page features using DiT [259].
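As an illustration of the 2D extension, here is a minimal sketch of the idea (our reading of the approach, not the exact implementation of [47, 371]; the class and parameter names are ours): learned embeddings of quantized token bounding boxes are added to the word embeddings before the T5 encoder.

    import torch
    import torch.nn as nn
    from transformers import T5ForConditionalGeneration

    class T52D(nn.Module):
        """T5 with additive 2D position (bounding-box) embeddings."""
        def __init__(self, name: str = "t5-base", n_bins: int = 1000):
            super().__init__()
            self.t5 = T5ForConditionalGeneration.from_pretrained(name)
            d_model = self.t5.config.d_model
            # One embedding table per box coordinate (x0, y0, x1, y1),
            # quantized into n_bins buckets.
            self.box_embed = nn.ModuleList(
                [nn.Embedding(n_bins + 1, d_model) for _ in range(4)]
            )

        def forward(self, input_ids, boxes, **kwargs):
            # boxes: (batch, seq_len, 4) integer coordinates in [0, n_bins]
            h = self.t5.shared(input_ids)              # word embeddings
            for k in range(4):
                h = h + self.box_embed[k](boxes[..., k])
            return self.t5(inputs_embeds=h, **kwargs)  # e.g. kwargs={"labels": ...}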
Regarding factors (3) and (4), we evaluated models of various sizes, ranging from 131M (BigBird) to 175B (GPT-3-Davinci) parameters, and varied the input context from 512 (BERT) to 20480 (Hi-VT5) tokens. Overall, we thoroughly evaluated multiple models in different testing setups to determine their performance under various conditions, as seen in Table 5.3.

Model          | Init.         | Params | Max Seq. | Test Setup      | ANLS_all↑ | ECE_all↓ | AURC_all↓ | ANLS_do | Abs   | Ex    | NA    | Li
text-only encoder-based models
Big Bird       | MPDocVQA      | 131M   | 4096     | Concat*         | 26.27     | 30.14    | 44.22     | 30.67   | 7.11  | 40.26 | 12.75 | 8.46
BERT-Large     | MPDocVQA      | 334M   | 512      | Max Conf.*      | 25.48     | 34.06    | 48.60     | 32.18   | 7.28  | 42.23 | 5.88  | 11.13
Longformer     | MPDocVQA      | 148M   | 4096     | Concat*         | 27.14     | 27.59    | 44.59     | 33.45   | 8.55  | 43.58 | 10.78 | 10.62
text-only encoder-decoder models
T5             | base          | 223M   | 512      | Concat-0*       | 19.65     | 19.14    | 48.83     | 25.62   | 5.24  | 33.91 | 0     | 7.31
T5             | MPDocVQA      | 223M   | 512      | Max Conf.*      | 29.48     | 27.18    | 43.06     | 37.56   | 21.19 | 44.22 | 0     | 10.56
T5             | base          | 223M   | 512      | Concat+FT       | 37.41     | 10.82    | 41.09     | 40.61   | 42.61 | 48.20 | 53.92 | 16.87
T5             | base          | 223M   | 8192     | Concat+FT       | 41.80     | 17.33    | 49.53     | 44.95   | 47.62 | 50.49 | 63.72 | 7.56
text-only large language models (LLM)
ChatGPT        | gpt-3.5-turbo | 20B    | 4096     | Concat-0        | -         | -        | -         | 35.07   | 16.73 | 42.52 | 70.59 | 15.97
ChatGPT        | gpt-3.5-turbo | 20B    | 4096     | Concat-4        | -         | -        | -         | 41.89   | 22.19 | 49.90 | 77.45 | 17.74
GPT3           | davinci3      | 175B   | 4000     | Concat-0        | -         | -        | -         | 43.95   | 18.16 | 54.44 | 73.53 | 36.32
GPT3           | davinci3      | 175B   | 4000     | Concat-4        | -         | -        | -         | 47.04   | 22.37 | 57.09 | 63.73 | 40.01
text+layout encoder-decoder models
T5-2D          | base          | 223M   | 512      | Concat+FT       | 37.10     | 10.85    | 41.46     | 40.50   | 42.48 | 48.62 | 52.94 | 3.49
T5-2D          | base          | 223M   | 8192     | Concat+FT       | 42.10     | 17.00    | 48.83     | 45.73   | 48.37 | 52.29 | 63.72 | 8.02
T5-2D          | large         | 770M   | 8192     | Concat+FT       | 46.06     | 14.40    | 35.70     | 48.14   | 50.81 | 55.65 | 68.62 | 5.43
text+layout+vision models
HiVT5          | -             | 316M   | 20480    | Hierarchical+FT | 23.06     | 11.91    | 54.35     | 22.33   | 33.94 | 17.60 | 61.76 | 6.83
LayoutLMv3     | MPDocVQA      | 125M   | 512      | Max Conf.*      | 20.31     | 34.97    | 47.51     | 25.27   | 8.10  | 32.60 | 8.82  | 7.82
Human baseline |               |        |          |                 | -         | -        | -         | 74.76   | 81.95 | 67.58 | 83.33 | 67.74

Table 5.3. Summary of baseline performance on the DUDE test set (all) and diagnostic subset (do). Test setups are defined as Max Conf.: predict one answer per page and return the answer with the highest probability over all pages; Concat: predict on tokens truncated to the maximum sequence length; FT stands for fine-tuning on DUDE training data; and -0 refers to zero-shot and -4 to few-shot inference. Average ANLS results per question type are abbreviated as (Abs)tractive, (Ex)tractive, (N)ot-(A)nswerable, (Li)st. (*) We report only results for the best-performing test setup (either Max Conf. or Concat). All scalars are scaled between 0 and 100 for readability.

5.5.2 Analysis & Discussion

To summarize, our study reveals that existing advanced language models such as BERT, Longformer, and BigBird struggle with comprehending visual elements and document layouts. To address this issue, we introduced the T5, T5-2D, and Hi-VT5 models that incorporate layout and visual information.
Still, their performance remains unsatisfactory, as evidenced by the comparison with the human baseline, similar to what has been reported for InfographicsVQA. This indicates that there is still room for enhancing the visual understanding of models on DUDE. Moreover, our findings indicate that a large LLM capable of processing long inputs is, by itself, insufficient for achieving strong performance on DUDE, especially for the extractive answer type. Finally, document length significantly affects the models' scores, as seen by the increase of 4.4-5.0 points when the T5 and T5-2D context length is extended from 512 to 8192. Similarly, model size correlates positively with the final score, but this holds only within a particular model type and is not the main factor influencing the results. State-of-the-art performance of 46.04 ANLS_all was achieved by T5-large with 2D layout understanding consuming 8192 tokens, confirming the observation above.

5.6 Detailed Results Analysis

5.6.1 Within Model Class Analysis

5.6.1.1 Encoder vs. Decoder

A key difference between encoder-only and (encoder-)decoder-based models is the ability to generate answers beyond the explicit textual content of the document. This is clearly reflected in the results for BigBird, Longformer, BERT, and LayoutLMv3, which score <10 ANLS% on abstractive questions, whereas they obtain merely average scores for extractive questions. On DUDE, we can claim that a generative model is necessary, given all the considered question types. Quite remarkably, while the human baseline demonstrates that humans find abstractive questions (ANLS ±82%) easier than extractive questions (ANLS ±68%), the reverse is true for all machine baselines. A potential confounder for these results could be the difference in output formatting for extractive vs. abstractive answers, which is hard to take into account with ANLS evaluation.

5.6.1.2 Incorporating Layout & Vision

When comparing T5 with and without 2D position embeddings on the diagnostic categories, we find the highest improvements on 'evidence table or list', 'complexity simple', and 'evidence plain'. Our study with the proposed baselines shows that questions requiring visual evidence are an important future challenge for the vision community. To get further insight into models' performance on these questions, we calculate a weighted average of ANLS over the visual categories. This reveals that GPT3 (4-shot) and T5-2D-large-8K obtain a tied score (ANLS=37%), even though they only have access to the text. Human performance, on the other hand, is close to double that (ANLS=72%), showing the need for better integration of the visual modality in DU models.

5.6.1.3 Toward Long Document Processing

DUDE clearly requires methods that can process long sequences, as evidenced by its average document length of 1832±2545 tokens. This is particularly evident when comparing standard NLP QA methods like BERT-concat, which underperforms Longformer [28] and BigBird [521], despite being the large version.
Experiments with T5 and T5-2D further support this claim, as extending the sequence length from 512 to 8192 leads to a ~5% ANLS improvement. The exception is Hi-VT5 [451], which performs worse than the rest of the methods. The authors of Hi-VT5 performed a text-denoising pretraining task that helped to better model the [PAGE] tokens, resulting in a better, compressed representation of the relevant information within a document conditioned on a question. Moreover, through extensive experimentation they found that 10 [PAGE] tokens per page were the best fit for the MP-DocVQA [451] dataset. We used similar hyperparameters, but DUDE might require better tuning of the [PAGE] tokens, since its images are more visually rich, with colored graphics and layouts. The hierarchical processing of documents with a meaningful visual component is a promising avenue for future research.

5.6.1.4 Diagnosis of LLM Results

The reasoning for including LLMs as baselines stems from the question: 'Does advanced text understanding suffice for solving DUDE?'. Our results on the diagnostic categories reveal some strengths and weaknesses of LLMs in the DocVQA task setting.

Strengths. GPT3 trumps all other tested models on list-type questions (ANLS=36-40%), which can be explained by the extractive nature of these questions. With 4-shot prompting, ChatGPT is better than all other tested baselines at answering non-answerable questions (ANLS=77.45%), which can partly explain the recent appeal of this particular GPT checkpoint. GPT3 (4-shot) also outperforms (ANLS=52.51%) the other tested baselines on questions from the 'complexity multi-hop' category, such as 'What city name appears the most often in the timetables?'.

Weaknesses. Compared to a simpler text-only generative baseline, T5-base-512 (ANLS=47%), LLMs perform two times worse on abstractive questions (ANLS=22%). Closer analysis reveals that LLMs (even with 4-shot prompting) predict abstractive questions to be non-answerable in 55% of cases (in reality: 10%). Operations such as arithmetic, counting, and comparisons remain generally elusive skills (<25% ANLS). Both LLMs we tested scored significantly lower than the human baseline on questions that require visual understanding, with an average ANLS score of 21%. This is understandable because they are text-only models. While the LLMs' zero-shot performance is relatively high, we note that DUDE consists of public-license documents from the web, which might potentially have been included in the LLMs' pretraining corpus.

5.6.2 Assessing Confidence

ECE measures the calibration of confidence, whereas AURC assesses both performance and confidence ranking [193] (see Section 2.2.3 for more detail).
The latter is therefore an appropriate metric for selecting the best model in real-world applications, where wrong predictions can yield undesired outcomes that could be prevented by manually revising low-confidence answers.

Interestingly, T5-base-512 scores better on calibration (ECE=10.82) than T5-2D-large-8K, the baseline with the highest ANLS yet worse calibration (ECE=14.4). In general, it seems calibration worsens when extending the maximum sequence length, whereas adding 2D position embeddings only positively affects ANLS. Of the baselines tested, T5-2D-large-8K achieves the best AURC.

Another interesting result comes from analyzing the calibration of models evaluated using the Concat strategy vs. the Max Conf. strategy. In the main paper, we reported results for the model with the relatively best ANLS. Thanks to our varied set of evaluation metrics, we discover that Max Conf. overall results in poor calibration (see Table 5.4), whereas considering ANLS, there is not always a clear winning strategy. This shows that predicting each page separately, thereby necessarily assuming conditional independence across pages, is not a reliable strategy for multipage DocVQA.

Model                       | ANLS  | ECE   | AURC
BertQA MPDocVQA Concat      | 29.8  | 13.83 | 43.28
BertQA MPDocVQA MaxConf     | 32.18 | 28.93 | 48.73
BigBird MPDocVQA Concat     | 30.67 | 25.07 | 47.2
BigBird MPDocVQA MaxConf    | 29.38 | 50.79 | 56.81
LayoutLMv3 MPDocVQA Concat  | 22.61 | 13.19 | 57.11
LayoutLMv3 MPDocVQA MaxConf | 25.27 | 31.31 | 58.54
Longformer MPDocVQA Concat  | 33.45 | 22.21 | 45.83
Longformer MPDocVQA MaxConf | 28.67 | 48.6  | 58.11
T5 MPDocVQA Concat          | 34.37 | 18.97 | 47.31
T5 MPDocVQA MaxConf         | 37.56 | 23.73 | 46.69
T5-base Concat-0            | 25.62 | 20.05 | 62.25
T5-base MaxConf-0           | 22.21 | 39.47 | 58.89

Table 5.4. Comparison of baselines using the Concat or Max Conf. strategies.

5.7 DUDE Competition Results

5.7.1 Submitted Methods

Overall, 6 methods from 3 different participants were submitted for the proposed tasks in the DUDE competition. To avoid cherry-picking from considering all submissions of individual participants, we consider only the last submission of each participant for the final ranking. All the methods followed an encoder-decoder architecture, which is a standard choice for VQA when abstractive questions are involved. Specifically, the submitted methods are mostly based on T5-base [383] as the decoder; for this reason, we include the T5-base baseline to compare how the participant methods improved on it. A short description of each method can be found in Table 5.5.

Two very recent state-of-the-art architectures, UDOP and HiVT5, were extensively leveraged by participants. The former is geared toward improved document page representations, while the latter targets multipage document representations. In their method reports, the UDOP-based models by Lenovo Research mention calculating confidence by multiplying the maximum softmax scores of the decoded output tokens, with two additional post-processing rules: (a) the confidence of predicted non-answerable questions is set to 1; (b) when abstaining, the confidence is set to 0.
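In sketch form, the reported confidence rules could look as follows (our paraphrase of the report; the 'none' string convention for non-answerable predictions is an assumption):

    import math

    def answer_confidence(decoded_tokens, token_probs, abstained: bool) -> float:
        """Sequence confidence as the product of per-token max softmax scores,
        with the two post-processing rules reported by Lenovo Research."""
        if abstained:
            return 0.0                       # rule (b): abstention
        answer = "".join(decoded_tokens).strip().lower()
        if answer == "none":                 # rule (a): predicted non-answerable
            return 1.0                       #   ('none' convention is hypothetical)
        return math.prod(token_probs)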
5.7.2 Performance Analysis

Table 5.6 reports the competition ranking, comparing the submitted methods' performance on the test set. Higher ANLS and AUROC values indicate better performance, while lower ECE and AURC values signify better calibration and confidence ranking. According to the findings, the UDOP+BLIP2+GPT approach attains the highest ANLS score (50.02), while also achieving the best calibration and OOD (out-of-distribution) detection performance. In a direct comparison of the MMT5 and HiVT5+modules methods, the former shows a higher ANLS score, yet did not provide any confidence estimates.
Thus, the overall winner is UDOP+BLIP2+GPT by Lenovo Research. Their submitted methods (ranked by highest ANLS) also differentiate themselves by their additional attention to confidence estimation. Based on the numbers in
\ No newline at end of file
diff --git a/assets/txts/pg_0172.txt b/assets/txts/pg_0172.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a5446cfd49c077afb1a0a5c04a1c74e6eab5c9
--- /dev/null
+++ b/assets/txts/pg_0172.txt
@@ -0,0 +1,68 @@
T5-base (ours): T5-base [383] fine-tuned on DUDE (AWS OCR), with a delimiter combining list answers into a single string, and not-answerable questions replaced with 'none'.

Lenovo Research
UDOP(M): Ensemble (M=10) of UDOP [443] models (794M parameters each) without self-supervised pretraining, only fine-tuned in two stages: 1) SP-DocVQA [450] and MP-DocVQA [451], and 2) DUDE (switching between Azure and AWS OCR).
UDOP+BLIP2: UDOP (M=1) with integrated BLIP2 [260] predictions to optimize the image encoder, plus additional page number features.
UDOP+BLIP2+GPT: UDOP (M=1) and the BLIP2 visual encoder with ChatGPT to generate Python-like modular programs that decompose questions for improved predictions [160, 437].

Upstage AI
MMT5: Multimodal T5 pretrained in two stages: single-page (ScienceQA [403], VQAonBD2023 [385], HotpotQA [508], SP-DocVQA) with two objectives (masked language modeling (MLM) and next sentence prediction (NSP)); multipage (MP-DocVQA and DUDE) with three objectives (MLM, NSP, page order matching). Fine-tuned on DUDE with per-page answers combined into the final output.

Infrrd.AI
HiVT5: Hi-VT5 [451] with 20 [PAGE] tokens, pretrained on a private document collection (no information provided) using a span masking objective [204]. Fine-tuned with MP-DocVQA and DUDE.
HiVT5+modules: Hi-VT5 extended with token/object embeddings for a variety of modular document understanding subtasks (detection: table structure, signatures, logo, stamp, checkbox; KIE: generic named entities; classification: font style).

Table 5.5. Short descriptions of the methods participating in the DUDE competition, in order of submission. The last submitted method is considered for the final ranking.

the table, several interesting observations can be made to support the suggested future directions and propose additional experiments:
• ANLS. The integration of UDOP, BLIP2, and ChatGPT contributes to the method's superior overall performance in answering different question types.
\ No newline at end of file
diff --git a/assets/txts/pg_0173.txt b/assets/txts/pg_0173.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e25724c2e19ddb4d937dc38656fe5300a72396c8
--- /dev/null
+++ b/assets/txts/pg_0173.txt
@@ -0,0 +1,99 @@
                 | Answer | Calibration   | OOD Detection | ANLS / answer type
Method           | ANLS ↑ | ECE ↓ | AURC ↓ | AUROC ↑      | Ex    | Abs   | Li    | NA
UDOP+BLIP2+GPT   | 50.02  | 22.40 | 42.10  | 87.44        | 51.86 | 48.32 | 28.22 | 62.04
MMT5             | 37.90  | 59.31 | 59.31  | 50.00        | 41.55 | 40.24 | 20.21 | 34.67
HiVT5+modules    | 35.59  | 28.03 | 46.03  | 51.24        | 30.95 | 35.15 | 11.76 | 52.50

Table 5.6. Summary of method performance on the DUDE test set. Average ANLS results per question/answer type are abbreviated as (Abs)tractive, (Ex)tractive, (N)ot-(A)nswerable, (Li)st. (*) All scalars are scaled between 0 and 100 for readability.

• ECE, AURC. Integrating UDOP, the BLIP2 visual encoder, and ChatGPT for question decomposition contributes to the method's performance in handling uncertainty across various question types.
• Abstractive. The top performance of UDOP+BLIP2+GPT on abstractive questions reveals the potential of combining the UDOP ensemble, BLIP2 visual encoder, and ChatGPT to enable abstract reasoning and synthesis of information beyond simple extraction.
• List. The performance of UDOP+BLIP2+GPT on list-based questions suggests that incorporating page number features can enhance the model's capability to process and generate list information, which might be spread across pages.
Figure 5.8 visualizes an overview of the performance of each submitted method with respect to diagnostic subset samples matching a certain diagnostic category. The models generally struggle with operations involving counting, arithmetic, normalization, and comparisons. As expected, models perform better when dealing with simpler questions (complexity simple) compared to more complex questions (complexity multi-hop, complexity other hard, and complexity meta). Models tend to perform better when handling evidence in the form of plain text (evidence plain) compared to other forms of evidence, such as visual charts, maps, or signatures. Performance across models is notably lower for tasks involving lists compared to other question types. Models show varying performance when dealing with different types of forms (e.g., date, numeric, other, proper).
Figure 5.10 studies the ability of the competitors' methods to answer questions on increasingly longer documents. We observe a significant drop in ANLS when aggregating scores over gradually longer documents. This is expected, as the longer the document is, the more probable it is that the answer will either be located on a later page or rely on a long-range dependency between the tokens (e.g., a multi-hop question). Strikingly, all methods' scores, except
\ No newline at end of file
diff --git a/assets/txts/pg_0174.txt b/assets/txts/pg_0174.txt
new file mode 100644
index 0000000000000000000000000000000000000000..621a5307ed350b1dc5a0502522daad0c6226f84c
--- /dev/null
+++ b/assets/txts/pg_0174.txt
@@ -0,0 +1,17 @@
Figure 5.8. We report the average ANLS per diagnostic category for each of the submitted methods vs. human and a baseline method T5-base. Since the diagnostic dataset contains a different number of samples per diagnostic category, we added error bars representing 95% confidence intervals. This helps visually determine statistically significant differences.
Hi-VT5+modules, drop significantly for questions on 2-page documents. This likely stems from the standard input size of T5-based methods, 512 tokens, which covers roughly one page.
\ No newline at end of file
diff --git a/assets/txts/pg_0175.txt b/assets/txts/pg_0175.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c28d5244f0d7619f8f5aff6de0ab6f39a895c4a3
--- /dev/null
+++ b/assets/txts/pg_0175.txt
@@ -0,0 +1,21 @@
Figure 5.9. A histogram (bins=8, matching the ANLS threshold of 0.5) of the average ANLS rate per QA pair when summing ANLS scores over competitor methods.

Figure 5.10. Left: A histogram of the number of questions relative to the number of pages in the document (limited to 20 pages). Right: A line plot of the average ANLS score per QA pair for documents of length at least (x-axis) pages.

Figure 5.9 analyzes the correlation of errors across competitor methods. A large portion of QA pairs is predicted completely wrong (ANLS rate = 0) by all competitor methods. This can have many plausible causes: a) since they all share a similar decoder (T5), the methods suffer from similar deficiencies; b) some QA pairs are too complex for current SOTA competitor methods, particularly questions requiring more complex reasoning or unique document-specific layout processing. To further analyze this phenomenon, we sample qualitative examples with different ANLS rates (Appendix B.1).
\ No newline at end of file
diff --git a/assets/txts/pg_0176.txt b/assets/txts/pg_0176.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8d940e92588d27690bd174b516b049ffafd860c3
--- /dev/null
+++ b/assets/txts/pg_0176.txt
@@ -0,0 +1,46 @@
5.8 Chapter Conclusion

In conclusion, this chapter introduces a new large-scale multipage, multi-domain, multi-industry Document Visual Question Answering benchmark for document understanding. Our dataset reflects the real-world environment, where we need to process long documents and understand diverse types of documents. The benchmark includes visual semantics such as tables, charts, figures, lists, checkboxes, stamps, and more, which are essential for real-world document understanding. The performance of SOTA textual and multimodal models still lags behind human performance, indicating the need for further improvement in the visual understanding of DU models. Nevertheless, we believe evaluating systems on DUDE could inspire new architectures and methods.
Limitations. As our approach is closer to real-world industrial applications, and enables models to recognize and understand new unseen data without the need for re-training, it does come with some limitations and constraining factors, including the use of only English-language documents. Future work could address these limitations and expand the benchmark to include other languages. Moreover, although our dataset can be considered large-scale, it still represents a relatively small sample of the plethora of documents that exist in the real world.
As a core contribution of DUDE, we wanted to emphasize the importance of evaluation beyond mere predictive performance. DUDE offers an interesting and varied test bed for the evaluation of novel calibration and selective QA approaches (e.g., [96, 273]). While this was not explicitly attempted in this iteration of the competition, we hope that future work will consider testing such methods against DUDE.
Future of the Shared Task As the competition evolves, we hope that DUDE will serve as an essential platform for pushing the frontiers of research and driving innovation in the DU field. Currently, our competition focuses on English-language documents, which means we miss out on the potential of incorporating multilingual data. An ideal extension for future iterations of the shared task would be to introduce multilingualism, which our framework can accommodate, provided that source documents are readily available. However, this would also require specifying language qualifications for annotation experts. Moreover, one could automate parts of the data collection and annotation processes by allowing the best-performing competition system to validate the aptitude and complexity of human-proposed QA pairs.
\ No newline at end of file
diff --git a/assets/txts/pg_0177.txt b/assets/txts/pg_0177.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ee6542242eacdb59b7f8540674d1ba69b87ff6d4
--- /dev/null
+++ b/assets/txts/pg_0177.txt
@@ -0,0 +1,3 @@
145
\ No newline at end of file
diff --git a/assets/txts/pg_0178.txt b/assets/txts/pg_0178.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b7242c23756073c1867d11150d90fa5a6a3a2f0b
--- /dev/null
+++ b/assets/txts/pg_0178.txt
@@ -0,0 +1,30 @@
Chapter 6

DistilDoc: Knowledge Distillation for Visually-Rich Document Applications

The contents of this chapter come from a publication under review at CVPR 2024 [471]:
Jordy Van Landeghem, Subhajit Maity, Ayan Banerjee, Matthew B Blaschko, Marie-Francine Moens, Josep Llados, and Sanket Biswas. DistilDoc: Knowledge Distillation for Visually-Rich Document Applications. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (under review), 2024
This is an external collaboration with Subhajit Maity, Ayan Banerjee, Josep Llados, and Sanket Biswas. The work was conceived during a research visit at the Computer Vision Center in Barcelona, Spain.
Disclosing the work done by the authors other than the supervisors:
• Jordy Van Landeghem created the project's scope, implemented and performed all DIC and downstream DocVQA experiments, including training the DLA teacher models and connecting the DLA inference and evaluation, and wrote the manuscript with supplementary material.
• Subhajit Maity and Ayan Banerjee built the DLA architectures and performed the DLA-KD experiments.
• Sanket Biswas brought the team together and helped with the related work and the introduction.
\ No newline at end of file
diff --git a/assets/txts/pg_0179.txt b/assets/txts/pg_0179.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a81fff9b08d66fd906bf382452650faff647f9a5
--- /dev/null
+++ b/assets/txts/pg_0179.txt
@@ -0,0 +1,46 @@
This chapter focuses on efficiency via knowledge distillation (KD) model compression for document understanding (DU) tasks. While DU research depends on increasingly sophisticated and cumbersome models, the field has neglected to study efficiency via model compression, referring to any technology that transforms large and complex models into smaller, streamlined models with similar performance [548].
Here, we design a KD experimentation methodology for leaner, performant models on DU tasks that are integral within larger task pipelines, specifically document image classification (DIC) and document layout analysis (DLA).
We carefully selected KD strategies (response-based, feature-based) for distilling knowledge to and from backbones with different architectures (ResNet, ViT, DiT) and capacities (base-small-tiny). We study what affects the teacher-student knowledge gap and find that some methods (tuned vanilla KD, MSE, SimKD with an apt projector) can consistently outperform supervised student training. Furthermore, we design a downstream task setup to evaluate the robustness of distilled DLA models on zero-shot layout-aware document visual question answering (DocVQA).
DLA-KD experiments result in a large mean average precision (mAP) knowledge gap, which translates unpredictably to downstream robustness, accentuating the need to further explore how to efficiently obtain more semantic document layout awareness.
This chapter motivates the need for more efficient DU models, especially for VRD tasks, and provides a benchmarking framework for future research on KD for DU tasks. Additionally, it motivates being smart about when to use which modality, as the downstream task can have a certain modality bias (e.g., DocVQA is a text-centric task, whereas DLA is more vision-centric). Finally, it links to the efforts in DUDE to use LLMs for DU, with the focus here on incorporating layout information from distilled DLA models into the LLMs.

6.1 Introduction

Visually-rich Document Understanding (DU) has attracted increasing interest over the last few years. It involves multiple tasks such as document image classification (DIC) [165, 195, 210, 284], key information extraction (KIE) [197, 272, 296, 422, 433], document layout analysis (DLA) [35, 36, 80, 362, 544] and document visual question answering (VQA) [100, 309, 310, 450]. Current SOTA DU models [153, 187] solve the task by using modern OCR engines to read the text and then combining the extracted text with spatial features to predict the page layout and structure. However, these multimodal architectures come with the following
\ No newline at end of file
diff --git a/assets/txts/pg_0180.txt b/assets/txts/pg_0180.txt
new file mode 100644
index 0000000000000000000000000000000000000000..86bb331e648b07690b7a1ba97f9f80d78a6eb19c
--- /dev/null
+++ b/assets/txts/pg_0180.txt
@@ -0,0 +1,112 @@
[Figure 6.1: schematic comparing a large baseline encoder consuming plain OCR text tags with a small, distilled student encoder whose layout-aware OCR prompt is enriched with DLA tags, both feeding an LLM decoder for DocVQA (example question: "what are the contents in page 2?"); the KD variants are assessed as practical (mAP, parameter size, GFLOPS), efficient (im/s throughput), and robust (ANLS, explorative analysis).]

Figure 6.1. DistilDoc presents the first framework to investigate the potential of KD-based DLA model compression to enrich LLM prompts with logical layout structure to practically and efficiently improve downstream applications such as DocVQA.

drawbacks: 1) they rely primarily on Large Language Models (LLMs) [542] pretrained on millions of samples, which depend more on OCR text quality than on visual features/document structure; 2) they can be computationally heavier due to the need to process and fuse information from different modalities; and 3) they may perform poorly in domains with poor OCR results or on low-resource languages. Therefore, this work focuses on single-modality, vision-only architectures that can be fine-tuned for handling VRDs in tasks involving the understanding of visual-layout semantics such as tables, titles, paragraphs, figures, etc. DLA is a useful preliminary step in a document processing workflow [35, 80], holding the key to enhancing practical downstream DU tasks such as DIC, KIE, and VQA. DLA can impart logical layout structure, beyond the geometric layout from OCR [164], and structured context to the document, enabling more accurate content extraction and interpretation. A recent DU competition [469] has pleaded to bridge the gap between DLA and DocVQA by introducing layout-navigating or multi-region questions.
To handle the computational demand of modality/task-specific models, knowledge distillation (KD) [21, 150, 178, 394] can prove an effective approach to obtain efficient modules for later re-use in enriching LLM document inputs. Teacher model compression has the potential to produce student models that improve over direct fine-tuning, also making them practical for deployment on resource-constrained devices or for faster real-time inference. The field of Document AI [79] is engaged with representing and understanding VRDs, but has thus far not explored KD-based model compression for improved efficiency
\ No newline at end of file
diff --git a/assets/txts/pg_0181.txt b/assets/txts/pg_0181.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba6dd475cbfba13a5d1a8a74d2ec3fe0e05458b0
--- /dev/null
+++ b/assets/txts/pg_0181.txt
@@ -0,0 +1,43 @@
and uncertainty estimation [126].
This work investigates the potential of enriching VRDs with logical layout structure derived from effective DLA model compression using KD methods to practically and efficiently improve downstream DU applications. The nature of the (document) dataset has a major impact on the KD process [434], which required motivated choices (regarding dataset usage [14, 165, 362], architectures, weight initialization [259], KD methods [63, 67, 170, 178, 183, 540], evaluation, downstream procedure [482], etc.) in designing our experimental methodology of KD benchmarking for DU tasks (DIC, DLA). This allows us to investigate aspects affecting teacher-student knowledge/capacity/initialization gaps.
The key contributions of the paper are twofold:
I.
We are the first to design, apply, and open-source an experimental methodology for comprehensively benchmarking KD-based model compression on DU tasks involving VRDs (DIC and DLA).
II. We design a novel evaluation procedure based on the downstream task of zero-shot layout-aware DocVQA to quantify the robustness of distilled DLA models.
Nevertheless, our contributions go beyond mere KD-based compression benchmarking, promoting logical layout analysis over geometric layout to enhance the generalization of DU models toward unseen documents with diverse and complex layouts, as demonstrated in Figure 6.1.

6.2 Related Work

Efficiency and Model Compression Efficiency through model compression is gaining relevance with the increasing parameter size and complexity of models such as LLMs [556]. Although KD is a prominent technique for model compression, several alternative approaches are worth mentioning. Quantization has recently been re-discovered in the context of LLMs with LoRA [184] and QLoRA [93], which achieve substantial model compression with minimal accuracy degradation. Advances have also been made in vision-and-language [57, 518] and, more recently, for vision transformer (ViT) training [269]. However, the effectiveness of quantization also depends on some key factors, including the model architecture, data type, bit-width, and the training recipes employed. In this direction, neural architecture search (NAS) has become an important field of study [55, 279, 280, 363]. Popular alternatives include model weight pruning [131, 288, 554], which benefits
\ No newline at end of file
diff --git a/assets/txts/pg_0182.txt b/assets/txts/pg_0182.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e5c951597058d2fedfbf11acd36b1d9fd49f9846
--- /dev/null
+++ b/assets/txts/pg_0182.txt
@@ -0,0 +1,43 @@
strongly from joint usage with other efficiency and model compression techniques, and adaptive inference with multi-exit architectures [501, 547], which is promising yet highly dependent on early-exit network design and uncertainty estimation. KD-based training [364] complements the aforementioned techniques, potentially leading to more accurate model exits and pruning. Moreover, KD strategies involve overall simpler design choices, depending mostly on the availability of a large teacher model trained on the domain data of interest. Therefore, we prioritize KD-based model compression and efficiency for practical DU applications.
Knowledge Distillation KD strategies can be grouped into three main categories: response-based KD [6, 21, 178, 314, 509, 541], which seeks to match the final-layer predictions of the teacher model; feature-based KD [8, 62, 67, 175, 221, 394], which aims to mimic features extracted from intermediate hidden layers of the deep network; and relation-based KD [355, 356, 447, 511], which exploits the relations between different layers or sampled data points. The latter approach, however, is more geared toward pixel-based semantic segmentation tasks. While feature-based KD is more versatile, it is more expensive and harder to implement than distilling soft teacher predictions. While offline methods [178, 394] consider an existing frozen teacher model, online methods [61, 538] update both student and teacher networks jointly.
Self-distillation [22, 528] represents a special case of online KD, which employs the same network as both the teacher and the student, progressively improving over the network's own performance, albeit disregarding the aim of efficiency.
Our work's scope is offline KD schemes, with a single converged teacher (vs. intermediate checkpoints [479] or ensembles [515]), single-modality inputs (vision only), and three different feature-extraction backbones (ResNets, ViT, and a self-supervised pretrained document foundation model, DiT [259]). Our study seeks to extend the empirical utility of KD to popular DU tasks (DIC & DLA) with a versatile benchmarking framework to ensure future compatibility, fostering KD-based DU model compression research.
Practical and Efficient Document Understanding Recent efforts to represent layout and document structure have gained substantial recognition, particularly with the incorporation of structural information into LLMs. The LayoutLM family [187, 502, 503] and GeoLayoutLM [296] laid the foundation of using the 2D positional information of text (word-block) tokens obtained from OCR as a geometric layout representation for the input. Recent work [416] has further enhanced this 2D representation by incorporating text lines or text blocks as layout groups inside the OCR text tokens. [482] further experiment with structure-preserving OCR, which uses appropriate spaces and line breaks as an
\ No newline at end of file
diff --git a/assets/txts/pg_0183.txt b/assets/txts/pg_0183.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b0a7e906b7cb71d7f5e4d0757892cd7908f88ea
--- /dev/null
+++ b/assets/txts/pg_0183.txt
@@ -0,0 +1,113 @@
[Figure 6.2: schematic of the experimental pipeline, left to right: pre-trained checkpoints (ViT-B, ResNet-101, DiT-B, ResNet-50, ViT-S/T; optional) obtained via supervised ImageNet-1K (1.3M), supervised RVL-CDIP (400K), self-supervised IIT-CDIP (11M), or supervised PubLayNet (360K) pretraining; teacher fine-tuning and student fine-tuning (random or pretrained initialization) on Tobacco-3482, RVL-CDIP_[1K], PRImA, and DocLayNet; response-based and feature-based KD methods; downstream evaluation on RVL-CDIP-N, DocVQA, and InfographicsVQA, with an example question "How many positive samples of Influenza A H1 pathogen were detected in DoD beneficiary?".]

Figure 6.2. Proposed experimental methodology to comprehensively study all aspects (left-to-right) that impact KD methods (response, feature; projectors) adapted for VDU task specifics (architecture, weight initialization, pretraining & finetuning datasets, student capacity). Downstream setups evaluate the robustness of distilled students.

LLM input, thereby improving the ability to capture layout and structural cues for zero-shot DocVQA [309, 310] tasks. [153, 263] seek to represent layout as region-level proposal features, representing logical layout elements (like titles, paragraphs, figures, tables, etc.) as in the DLA task. To further study the utility of logical layout representations, [498] address asking questions conditioned on a specific region of a page, improving upon the design of DocVQA, which provides too many in-line questions (>80%). More recently, PDFTriage [400] generates a structured metadata representation of born-digital documents, extracting both geometric and logical layout elements like section text, figure captions, headers, and tables for a more precise QA approach.
DUDE [468] offers a testing bed for DocVQA on multipage, multi-type documents with varying layouts, including questions conditioned on layout navigation, e.g., 'Which pages have tables?'.
Our explorations focus on making the most of the logical layout features obtained from the multi-domain DLA benchmark DocLayNet [362]. We build upon the aforementioned advancements and explore how incorporating document structure can enhance the performance of downstream task models, aligning with the trend of enriching LLMs with rich-text prompting and layout-aware representations.

6.3 Experimental Setup

This section documents the experimental methodology established in this work, as visualized in Figure 6.2, including datasets, architectures and backbones for teacher and student models, KD methods, and evaluation metrics for the
\ No newline at end of file
diff --git a/assets/txts/pg_0184.txt b/assets/txts/pg_0184.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a62d9d15a7b71612b93b6321ec96548be236f205
--- /dev/null
+++ b/assets/txts/pg_0184.txt
@@ -0,0 +1,91 @@
tasks and distillation effectiveness. The goal is to provide a framework for future research on KD for DU tasks and to allow pinpoint comparisons on KD aspects such as the teacher-student knowledge and capacity gap, teacher pretraining, student network initialization, etc.

Table 6.1. Dataset usage for DIC, DLA, and downstream tasks. Symbols: P = pretraining, DP = document pretraining, T = teacher training, S = student training, * = subsampling, E = teacher/student evaluation, D = downstream evaluation.

Dataset            | Task | Usage   | Size  | # Cls
ImageNet [90]      | DIC  | P       | 1.28M | 1000
IIT-CDIP [252]     | DIC  | DP,T,S  | 11M   | /
Tobacco-3482 [232] | DIC  | T,S,E   | 3482  | 10
RVL-CDIP [165]     | DIC  | DP,T,E  | 400K  | 16
PRImA [14]         | DLA  | T,S,E   | 400   | 6
DocLayNet [362]    | DLA  | T,S,E   | 80.8K | 11
RVL-CDIP-N [241]   | DIC  | D       | 1K    | 12
SP-DocVQA [450]    | VQA  | D       | 12.8K | 50K
Infographic [310]  | VQA  | D       | 5.5K  | 30K

6.3.1 Datasets

Tab. 6.1 lists all datasets used (in)directly for the experiments. As there is no existing methodology for KD experimentation on the tasks involved, we motivate the design choices:
DIC We benchmark results on both Tobacco-3482 (original train-val-test splits of 800-200-2482) and RVL-CDIP. The originally large training-set size of RVL-CDIP hinders experimentation (long iteration cycles), which is why we create a subsampled student training set, RVL-CDIP_1k, by randomly selecting 1K images per class. By evaluating on the full RVL-CDIP test set, we provide a fair evaluation of the usefulness of KD methods while avoiding the cumbersomeness of student fine-tuning on such a large dataset.
While RVL-CDIP is the de facto standard for measuring performance on the task of document classification, the literature [242, 470] has reported several undesirable characteristics, such as (near-)duplicates causing substantial overlap between the train and test distributions. We complement independently and identically distributed (i.i.d.) test-set evaluation with benchmarking on RVL-CDIP-N [241], a covariate-shift dataset that allows us to evaluate the robustness of KD methods to domain shift, which is a common problem in real-world applications.
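The subsampling itself is a one-off preprocessing step; a minimal sketch of how RVL-CDIP_1k can be constructed (the (path, label) sample format and the fixed seed are our own assumptions, not the released preparation code):

import random
from collections import defaultdict

def subsample_per_class(samples, per_class=1000, seed=42):
    # samples: iterable of (image_path, label) pairs covering all 16 classes
    by_label = defaultdict(list)
    for path, label in samples:
        by_label[label].append((path, label))
    rng = random.Random(seed)
    subset = []
    for label in sorted(by_label):
        items = by_label[label]
        rng.shuffle(items)
        subset.extend(items[:per_class])  # keep 1K images per class
    return subset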
\ No newline at end of file
diff --git a/assets/txts/pg_0185.txt b/assets/txts/pg_0185.txt
new file mode 100644
index 0000000000000000000000000000000000000000..caeb6bf1cf13e65f1d52ae46fde5ee122bf6ae73
--- /dev/null
+++ b/assets/txts/pg_0185.txt
@@ -0,0 +1,41 @@
DLA We benchmark results on DocLayNet (reporting evaluation on the validation set, following common practice) and PRImA. The former is a large-scale human-annotated dataset with 81K images and 11 categories of logical layout elements, while the latter is a smaller dataset with 400 images and 6 classes. DocLayNet contains wide layout variability with six diverse document types (patents, manuals, scientific, legal, reports, tenders) in English. They have been hand-annotated by trained experts, making it the gold standard for DLA. Alternatively, the PubLayNet [544] or MS-COCO [274] benchmarks have been used for pretraining DLA models. However, the former lacks diversity, as it only contains documents from the scientific domain, while the latter is a more common object detection benchmark for natural scenes.
We consider a mirrored data setup for both tasks, with one larger benchmark dataset (RVL-CDIP, DocLayNet) and a smaller, easier dataset (Tobacco-3482, PRImA). This allows us to compare KD efficacy with more or less accurate teachers across tasks.

6.3.2 Architectures and Backbones

We evaluated three backbone architectures, representing different approaches to the tasks of DIC and DLA.
Backbones Residual Network (ResNet) [167]: a supervised pretrained CNN-based architecture that is a staple in image recognition.
Vision Transformer (ViT) [101]: a supervised pretrained Transformer-based architecture that is effective for a variety of CV tasks.
Document Image Transformer (DiT) [259]: a self-supervised pretrained architecture specifically designed for DU tasks, pretrained on 11M document images from IIT-CDIP with a Masked Image Modeling objective, as inspired by BEiT [24].
Specific to DLA, we use the Mask R-CNN [168] meta-architecture for instance segmentation with two different backbones: i) classic ResNets and ii) ViT, with the latter more challenging to integrate [267].
Historically, CNNs have been more popular for DLA due to their accuracy, speed, and the multiple optimizations built into the meta-architectures (involving a backbone, neck, and head). However, recent work points to the potential of ViTs as plain (non-hierarchical) object detectors [268]. Compared
\ No newline at end of file
diff --git a/assets/txts/pg_0186.txt b/assets/txts/pg_0186.txt
new file mode 100644
index 0000000000000000000000000000000000000000..956410aa0607682ada75cf2fdb3741bf8b5945d2
--- /dev/null
+++ b/assets/txts/pg_0186.txt
@@ -0,0 +1,37 @@
to Transformers, CNNs have strong inductive biases of translation equivariance and locality, a fundamental difference that is less explored in a KD context [33].
Network Architecture and Initialization Document images are very different from natural images, yet most available vision backbones of different sizes are pretrained on the latter, except for DiT. Nevertheless, ViTs seem to struggle to learn a function when starting from random initialization, both as teacher and student networks. Therefore, we use ImageNet-pretrained checkpoints for all models considered, even for student network initialization.
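In practice, this initialization amounts to loading off-the-shelf ImageNet checkpoints for every backbone; a sketch using the timm library (the model identifiers are illustrative of the variants used, not our released training code, and DiT checkpoints come from its own release rather than timm):

import timm

NUM_CLASSES = 16  # RVL-CDIP

teacher = timm.create_model("vit_base_patch16_224", pretrained=True,
                            num_classes=NUM_CLASSES)
student = timm.create_model("vit_tiny_patch16_224", pretrained=True,
                            num_classes=NUM_CLASSES)
# CNN pair with closely matching hidden output dimensionality
cnn_teacher = timm.create_model("resnet101", pretrained=True,
                                num_classes=NUM_CLASSES)
cnn_student = timm.create_model("resnet50", pretrained=True,
                                num_classes=NUM_CLASSES)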
Teacher Models While there are many model variants with different capacities for each of the backbones (Tab. D.1), we opt for the Base variant for Transformers, which arguably is the most common. We consider ResNet-101, as it has the attractive property of having a similar hidden-layer output dimensionality as the next smaller variant, ResNet-50.
The comparison of ViT-B and DiT-B allows us to evaluate the effects of different pretraining schemes (supervised, self-supervised) and how this affects knowledge transfer.
Student Models For DIC, we consider ViT-small and ViT-tiny, as well as a CNN-based architecture (ResNet-50), whereas for DLA, we consider Mask R-CNN with a ResNet-50 backbone and a ViT-tiny backbone. Due to the computational demand of training instance segmentation models, we only consider the ViT-tiny backbone for the student model, making it impossible to analyze KD methods for an increasing teacher-student capacity gap. While it would have made an interesting comparison, DiT has not been released in a variant smaller than DiT-B, and given the computational demand of pretraining DiT on the entire IIT-CDIP dataset containing 42 million document images, we did not consider it for student training. One might regard the knowledge transfer of DiT-B to a smaller ViT-(S/T) as potentially resulting in DiT-(S/T), yet the ImageNet or random initialization of the student network differs substantially from the self-supervised DiT weight space.
\ No newline at end of file
diff --git a/assets/txts/pg_0187.txt b/assets/txts/pg_0187.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d8ea714883c7560663d12dfe67dcacdf97acd338
--- /dev/null
+++ b/assets/txts/pg_0187.txt
@@ -0,0 +1,57 @@
6.3.3 KD Methods

The basic approach of knowledge distillation consists of transferring 'knowledge' from a cumbersome teacher model $f^t$ to a lightweight student model $f^s$, where $f : \mathcal{X} \to \Delta^{\mathcal{Y}}$ is a function mapping input data $\mathcal{X}$ to a conditional probability distribution $P(y' \mid x)$ over output labels $y' \in \mathcal{Y} = [K]$ for $K$ classes [368]. When this model compression approach is done effectively, the student model will be more efficient in terms of memory and computation. The top-1 class prediction is $\hat{y} = \operatorname{argmax}_{y' \in \mathcal{Y}} [f(x)]_{y'}$, with $\hat{p} = \max_{y'} [f(x)]_{y'}$ the posterior probability. For convenience, $[\tilde{f}(x)]_k$ denotes the $k$-th element of the logits vector $\tilde{f}(x) \in \mathbb{R}^K$, which is normalized with the temperature-scaled softmax

$f(x) = \sigma\big(\tilde{f}(x)\big) = \frac{\exp(\tilde{f}(x)/\tau)}{\sum_{k=1}^{K} \exp([\tilde{f}(x)]_k/\tau)}.$

Let each function $f$ be parameterized by $\theta$, holding all trainable parameters of the function, separable into $L$ layers, where $f_l(x)$ denotes the $l$-th layer output, e.g., the penultimate layer output $f_{L-1}(x)$.
While there exists a wealth of ever-growing KD methods, we have carefully chosen a combination of: simple methods mimicking the basic principles of KD (i, iv); more advanced KD methods that target specific improvements, such as penalizing the non-target class logits (iii) or distilling the knowledge of intermediate layers (v); and methods that take a step back from established KD practices by optimizing the mean squared error (MSE) between teacher-student logits or reusing the teacher classifier (ii, vi).
Every method is explained with its loss function, additional hyperparameters, and training parameters.
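In code, the temperature-scaled softmax and the derived top-1 prediction and confidence read as follows (a NumPy sketch mirroring the notation above, not a benchmark implementation):

import numpy as np

def softmax_t(logits, tau=1.0):
    # temperature-scaled softmax over a logit vector in R^K
    z = np.exp((logits - logits.max()) / tau)  # max-shift for numerical stability
    return z / z.sum()

logits = np.array([2.0, 0.5, -1.0])
probs = softmax_t(logits, tau=2.5)  # tau > 1 softens the distribution
y_hat = int(np.argmax(probs))       # top-1 class prediction
p_hat = float(probs.max())          # posterior probability (confidence)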
(i) Vanilla KD [178] optimizes a linear combination of the hard-target student cross-entropy (CE) loss and a Kullback-Leibler (KL) divergence loss with the soft-target teacher predictions, with KD loss hyperparameters $\alpha \in [0, 1]$ and temperature $\tau > 1$, which respectively weight the student loss and control the softness of the teacher logits.

$\mathcal{L}_{\mathrm{KD}} = \alpha \, \underbrace{\mathcal{L}_{\mathrm{CE}}(y, \hat{y}^s)}_{\tau = 1} + (1 - \alpha) \, \tau^2 \, \underbrace{\mathcal{L}_{\mathrm{KL}}\big(f^t(x), f^s(x)\big)}_{\tau > 1}$

(ii) MSE loss between the teacher-student logit vectors enables direct logit-level matching [217]:

$\mathcal{L}_{\mathrm{MSE}} = \big\lVert \tilde{f}^s(x) - \tilde{f}^t(x) \big\rVert_2^2$

(iii) NKD The normalized KD loss [509] decouples vanilla KD into a normalized (indicated by $\mathcal{N}$) combination of the target ($c \in \mathcal{Y}$) loss and the non-target loss in CE form, where $\gamma \in [0, 1]$ is a trade-off hyperparameter and $\tau$ the temperature:
\ No newline at end of file
diff --git a/assets/txts/pg_0188.txt b/assets/txts/pg_0188.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9ce9771000b4b065b854e5374eda394a4b3b7484
--- /dev/null
+++ b/assets/txts/pg_0188.txt
@@ -0,0 +1,73 @@
$\mathcal{L}_{\mathrm{NKD}} = \underbrace{- [f^t(x)]_c \log\big([f^s(x)]_c\big)}_{\text{target}} - \gamma \cdot \tau^2 \cdot \underbrace{\sum_{k \neq c}^{K} \mathcal{N}\big([f^t(x)]_k^{\tau}\big) \log\Big(\mathcal{N}\big([f^s(x)]_k^{\tau}\big)\Big)}_{\text{non-target}}$

(iv) FitNet [394] enables feature-based KD by minimizing the Euclidean distance between the intermediate feature maps of the teacher and student networks (i.e., an MSE loss). A trainable projector $\mathcal{P}(\cdot)$ (e.g., a linear projection layer) is required if the dimensionality of the hint layer(s) $h \in [1, L+1]$ outputs does not correspond to that of the student. There are no hyperparameters, except for the projector design and where to place the hint layers in the teacher network.
(v) ReviewKD [67] uses multi-stage information (multiple layers) of the teacher to supervise one student layer. The knowledge review mechanism is too complex to cover here, as it involves multiple modules (residual learning, an attention-based fusion projector, and a hierarchical context loss). This work claimed the first exploration of KD for instance segmentation, which is why we include it only for DLA.
(vi) SimKD [63] is a hybrid KD method that combines the advantages of response-based and feature-based KD. On the one hand, it reuses the pretrained, frozen teacher classifier for student inference ($f^t_L(\mathcal{P}(f^s_{L-1}(x)))$), and on the other hand, it adopts MSE for feature alignment (following a projector) of the penultimate-layer feature representations. Note that the former classification output is not used for training or loss calculation, only the latter projected feature-map alignment:

$\mathcal{L}_{\mathrm{SimKD}} = \mathcal{L}_{\mathrm{MSE}}\big(f^t_{L-1}(x), \, \mathcal{P}(f^s_{L-1}(x))\big)$

While the projector can safely be discarded for (iv, v) to obtain cost-free student inference, SimKD requires both the trained projector and the teacher classifier to be used (and stored) for student inference. SimKD originally proposed a CNN-based projector between the teacher and student feature maps (assuming C(hannels) x H(eight) x W(idth) inputs). For compatibility with ViT-based architectures, we contribute a novel variant of SimKD, which uses a linear projection layer on the [CLS] token at the penultimate layer. Alternatively, we draw upon [77, Theorem 1], stating that a multi-head self-attention layer can simulate a convolutional layer: we reshape the penultimate hidden-layer output (dropping the [CLS] token from the, e.g., 197 tokens of ViT-B) to (C x W x H), where C is the hidden size and W, H are equal to the number of patches per side (e.g., 14 for ViT-B with patch size 16 and 224x224 images), and finally apply the original CNN projector to obtain the projected feature maps.
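To make the response-based objectives concrete, a compact PyTorch sketch of (i) and (ii) is given below (hyperparameter defaults follow the settings quoted in Tab. 6.6; the code is our own illustration, not the released training script):

import torch.nn.functional as F

def vanilla_kd_loss(student_logits, teacher_logits, targets, tau=2.5, alpha=0.5):
    # (i) hard-target CE (tau = 1) plus tau^2-scaled KL to soft teacher targets
    ce = F.cross_entropy(student_logits, targets)
    kl = F.kl_div(
        F.log_softmax(student_logits / tau, dim=-1),
        F.softmax(teacher_logits / tau, dim=-1),
        reduction="batchmean",
    )
    return alpha * ce + (1.0 - alpha) * (tau ** 2) * kl

def mse_logit_loss(student_logits, teacher_logits):
    # (ii) direct logit-level matching between student and teacher
    return F.mse_loss(student_logits, teacher_logits)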
\ No newline at end of file
diff --git a/assets/txts/pg_0189.txt b/assets/txts/pg_0189.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d17b75a0ce94e99ecc7f80852bd803704cf2aab3
--- /dev/null
+++ b/assets/txts/pg_0189.txt
@@ -0,0 +1,41 @@
Task considerations The number of KD methods considered differs between the tasks, as some methods were not designed for use in a meta-architecture like Mask R-CNN. Response-based methods using logits are not capable of providing knowledge for object localization (e.g., the region proposal network head), making feature mimicking of vital importance. Moreover, the performance of instance segmentation highly depends on the quality of deep features to locate the objects of interest [509, 541], which is why we only consider feature-based KD methods for DLA (v, vi). When deciding upon the KD methods to include, the literature reported ReviewKD as the feature-based SOTA, NKD as the response-based SOTA, and SimKD as the hybrid SOTA on image classification (CIFAR-100).

6.3.4 Evaluation

Metrics Predictive performance evaluation for DIC follows standard practice with accuracy, whereas we forego the F1 score as the classes are balanced. For DLA, we use the standard metric of mean average precision (mAP) @ intersection over union (IoU) [0.50:0.95] of bounding boxes.
Efficiency evaluation considers the combination of parameter size and FLOPS (floating-point operations) to be representative enough to compare distilled models.
Following calls in the DU literature [468] to establish calibration and confidence ranking as defaults in the evaluation methodology, we include the Expected Calibration Error (ECE) [156, 332, 340] to evaluate top-1 prediction miscalibration and the Area Under the Risk-Coverage Curve (AURC) [138, 193] to measure the error rate over selective (% of test set) accuracy (detailed in Section 2.2.3).
Covariate shift DIC-KD evaluation To evaluate the robustness of distilled models, we consider evaluating the impact of domain shift on the downstream task of DIC. Luckily, there exists a dataset similar to RVL-CDIP in terms of document types and classes, yet different in terms of document sources and label distribution. This dataset is called RVL-CDIP-N [241], and we will use it to evaluate the robustness of distilled models.
\ No newline at end of file
diff --git a/assets/txts/pg_0190.txt b/assets/txts/pg_0190.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c962fe1325d83916ac632dce15e3dcd29386134a
--- /dev/null
+++ b/assets/txts/pg_0190.txt
@@ -0,0 +1,46 @@
6.3.5 DLA-enriched LLM prompting

Downstream DLA-KD evaluation An important objective of this work is to demonstrate the usefulness of DLA predictions in downstream VRD tasks.
As SOTA DLA models are often as cumbersome (in parameter size, GFLOPS) as the downstream models, this motivates the need for KD to obtain more efficient DLA predictors that can be used to enrich document inputs with logical layout information.
While we focus on vision-only document inputs in benchmarking KD, we take the opportunity to benchmark DLA as part of a zero-shot DocVQA task setup with text-only LLMs [482], which can benefit from additional layout information when answering questions about content that appears in certain logical elements ('what is the first column header of Table 3', 'what is the title of the document?'). Similarly, it could benefit from knowing what falls within an infographic picture or legend, which is why we benchmark on SP-DocVQA and InfographicVQA, with the latter containing more visually-rich information. As the model of choice, we have opted for Llama-2-7b-chat [452] with 4-bit quantization to keep GPU memory requirements to a minimum, while still performing sufficiently reliably. Evaluation is done using ANLS [39, 468] on predicted answers vs. ground truths.
The prompt design follows [482], with a task instruction and placeholders for the question and the document input, the latter depending on the prompt parameterization (see Tab. 6.2). Possible values are plain (single-spaced OCR tokens), space (tokens placed heuristically with whitespace in their approximate position), or DLA, which adds start and end tags such as <table> and </table> to indicate the logical layout as predicted by a DLA model. A pseudo-algorithm (Sec. 6.3.5) details the procedure to generate DLA-enriched prompts.
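The three parameterizations can be illustrated on a toy page before giving the full procedure (the tag names follow the DocLayNet label set; the snippet is a simplified illustration, not the algorithm below):

# OCR tokens of a toy page, in reading order
tokens = ["Annual", "Report", "Revenue", "2022", "1.2M"]

plain = " ".join(tokens)
# 'space' would instead pad tokens with whitespace to their approximate
# x/y position (structure-preserving OCR of [482]).
# 'DLA' wraps the token spans with predicted logical-layout tags:
dla = "<title> Annual Report </title> <table> Revenue 2022 1.2M </table>"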
KIE is regarded as an important downstream DU task, yet we believe (as supported by [166]) that it would benefit less from DLA, since most of its information is organized as key-value pairs with only local context relevance.

6.4 Results & Discussion

DLA-KD This work investigates different SOTA KD methods and integrates them into the DLA framework with ResNet and ViT feature-extraction backbones. KD in DLA poses significant challenges owing to the intricate
\ No newline at end of file
diff --git a/assets/txts/pg_0191.txt b/assets/txts/pg_0191.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3f6d8e03178f21a731941d9bcceaf77bac0fac02
--- /dev/null
+++ b/assets/txts/pg_0191.txt
@@ -0,0 +1,156 @@
Algorithm 1: Construction of DLA-enriched prompts p_DLA

Input: A finite set D_test = {(x^(i), y^(i))}_{i=1}^N of holdout data, consisting of document images x^(i) and corresponding labels y^(i); a document image v
Output: Tokenized DLA-enriched prompts p_DLA
Parameters: zeta_iou: IoU threshold for layout-token boxes (default: 0.3); Ignore-labels: DLA labels to ignore for enrichment (default: {'Text'})
Require: A trained DLA model and an OCR engine

(1) Feed the image to the DLA model to obtain labeled layout boxes:
  {(b_j, c_j, m_j)}_{j=1}^J <- DLA(v)    // boxes, classes, metadata
Feed the image to the OCR engine to obtain tokens and token-boxes:
  u = {(w_t)}_{t=1}^T, s = {(x_t^1, y_t^1, x_t^2, y_t^2)}_{t=1}^T <- OCR(v')
Standardize the layout boxes to the same xy-format:
for j <- 1 to J do
  b_j <- StandardizeBbox(b_j)    // standardize to xy-format
  if OCR image dims != DLA image dims then
    // precomputed OCR (DUE) results can be reused, yet OCR images can have a higher resolution
    b_j <- InterpolateBbox(b_j, v, v')    // interpolate the layout box to the OCR image size

(2) Find the closest start and end token-boxes (input: DLA predictions DLA(v), OCR tokens u, OCR token-boxes s; output: an updated set of OCR tokens u^ and token-boxes s^):
for j <- 1 to J do
  if c_j in Ignore-labels then continue
  S <- (0, inf); E <- (-1, inf)    // initialize start and end with dummy index and distance values
  for t <- 1 to T do
    // multiple relaxing heuristics to find the closest token-box to the layout box:
    // keep token-boxes fully contained within the layout box or with IoU > threshold
    if not (FullyContains(b_j, s_t) or IntersectionOverUnion(b_j, s_t) > zeta_iou) then continue
    S <- min(S, (t, Laplacian(b_j, s_t)))    // minimal Laplacian distance to the top-left corner point
    E <- min(E, (t, Laplacian(b_j, s_t)))    // minimal Laplacian distance to the bottom-right corner point

(3) Insert the DLA labels before and after the closest tokens:
C <- 0    // token insertion counter
u^, s^ <- u, s    // initialize the to-be-updated OCR tokens and token-boxes
I <- SortAndLabel(S, E)    // sort start and end tokens together by index and add the label type
for j <- 1 to |I| do
  if I_j is a start token then
    u^ <- insert start label at I_j + C    // insert a label such as <table> before the token
    s^ <- insert b_j at I_j + C
    C <- C + 1
  if I_j is an end token then
    u^ <- insert end label at I_j + C + 1    // insert a label such as </table> at the next token
    s^ <- insert b_j at I_j + C + 1
    C <- C + 1
return u^, s^    // tokens and token-boxes with DLA labels, to be used in the prompt design of [482]
\ No newline at end of file
diff --git a/assets/txts/pg_0192.txt b/assets/txts/pg_0192.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cbb84112ea0b2faa6429cddc3e238a65e1221071
--- /dev/null
+++ b/assets/txts/pg_0192.txt
@@ -0,0 +1,280 @@
Table 6.2. Prompt design following [482], with placeholders depending on the parameterization of the document input (plain, space, DLA).

You are asked to answer questions asked on a document image.
The answers to questions are short text spans taken verbatim from the document.
This means that the answers comprise a set of contiguous text tokens present in the document.
Document:
{Layout Aware Document placeholder}
Question: {Question placeholder}
Directly extract the answer to the question from the document with as few words as possible.
Answer: {}

Table 6.3. Results for KD methods applied on DocLayNet [362].

Teacher | Student | Method     | mAP↑  | Flops↓ | Params↓ | Im/s↑
ViT-B   | -       | Supervised | 65.65 | 107G   | 114M    | 20
R101    | -       | Supervised | 73.56 | 60G    | 63M     | 12
-       | ViT-T   | Supervised | 62.85 | 68G    | 26M     | 14
-       | R50     | Supervised | 72.43 | 33G    | 44M     | 12
R101    | R50     | SimKD      | 62.71 | 29G    | 44M     | 21
R101    | R50     | ReviewKD   | 61.17 | 37G    | 44M     | 19
ViT-B   | ViT-T   | SimKD      | 57.51 | 42G    | 26M     | 22
ViT-B   | ViT-T   | ReviewKD   | 57.2  | 84G    | 26M     | 17

nature of detection, introducing new obstacles related to regression, region proposals, and sparser label volumes [64]. As motivated in Sec. 6.3.3, we prioritize feature-based KD methods, with results on DocLayNet in Tab. 6.3. The performance comparison in terms of mean average precision (mAP) and FLOP counts shows that ResNet-50 students with SimKD are overall superior in

Table 6.4. Validation ANLS (scaled to %) of Llama-2-7b-chat [452] on SP-DocVQA [309] (top) and InfographicVQA [310] (bottom), where (if marked) the prompt is enriched with DLA predictions from a ViT-B-based Mask R-CNN.
space | task | DLA | ANLSval | Image/Photo | Yes/No | Figure/diagram | Form  | Free_text | Handwritten | Layout | Others | Table/list
✓     | ✓    | ✓   | 61.2    | 44.58       | 49.13  | 40.28          | 68.95 | 68.39     | 52.81       | 61.38  | 56.44  | 56.7
✗     | ✓    | ✓   | 58.39   | 44.43       | 41.67  | 34.81          | 66.38 | 67.82     | 52.1        | 59.19  | 55.91  | 52.79
✓     | ✓    | ✗   | 62.46   | 42.95       | 49.43  | 40.93          | 71.15 | 70.59     | 55.87       | 61.87  | 61.05  | 58.31
✗     | ✓    | ✗   | 57.63   | 45.38       | 51.52  | 34.97          | 67.88 | 69.71     | 53.19       | 55.51  | 55.78  | 53.81

space | task | DLA | ANLSval | Arithmetic | Comparison | Counting | Figure | Map   | Multi-span | Abs   | Q span | Single span | Table/list | Text  | Visual/layout
✓     | ✓    | ✓   | 28.05   | 9.92       | 25.28      | 7.83     | 26.28  | 19.0  | 21.85      | 8.82  | 41.84  | 33.54       | 25.57      | 34.6  | 29.17
✗     | ✓    | ✓   | 28.36   | 14.93      | 29.15      | 7.64     | 27.05  | 19.0  | 19.41      | 11.21 | 46.87  | 33.35       | 25.56      | 34.59 | 26.69
✓     | ✓    | ✗   | 27.97   | 9.78       | 25.13      | 6.99     | 25.93  | 21.04 | 22.33      | 8.2   | 43.36  | 33.53       | 25.76      | 35.06 | 27.47
✗     | ✓    | ✗   | 29.08   | 14.15      | 26.94      | 11.35    | 27.52  | 19.1  | 19.79      | 12.79 | 48.44  | 33.79       | 26.17      | 35.24 | 26.39
\ No newline at end of file
diff --git a/assets/txts/pg_0193.txt b/assets/txts/pg_0193.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3acd835d931478810cea4c2843c02db7d90f87a6
--- /dev/null
+++ b/assets/txts/pg_0193.txt
@@ -0,0 +1,44 @@
terms of both efficiency and detection, while the ViT-Tiny student has the smallest number of parameters with comparable performance in terms of mAP.
However, one can observe a generally large knowledge gap between the teacher and student models (≈8% for ViT and ≈10% for the ResNets), as crucial details about the document object boundaries, shapes, and sizes can get lost during the compression process. Moreover, KD performance with a ViT backbone is worse compared to ResNets due to (i) the attention overhead, i.e., transferring this attention-based knowledge to a student model requires careful consideration of how to distill these complex attention patterns effectively, and (ii) initialization and hyperparameter sensitivity, e.g., finding an appropriate domain-pretrained checkpoint and setting patch sizes and attention heads can affect the KD process, requiring more delicate tuning. The CNN layers of ResNets are, on the other hand, permutation invariant and provide more flexibility toward KD.
KD methods are hard to integrate into object detection frameworks, especially for ViTs, where there is no intermediate multi-scale FPN module. Our contribution lies in extending the hybrid SimKD [63] method to the DLA task and showing a competitive analysis with the existing SOTA, ReviewKD [67].
Downstream DLA-KD Tab. 6.4 reports results on the validation sets, as these are hyper-annotated with evidence, question and answer types, and operations, allowing for more fine-grained analysis. Detailed results for distilled DLA-enriched prompts are available in Appendix D.4.
On SP-DocVQA, DLA-enriched prompting (without spacing) improves from 57.63 → 58.39, whereas (with spacing) the improvement (27.97 → 28.05) is less pronounced on InfographicVQA; yet DLA predictions are still useful in this setting, as also evidenced by questions involving 'Visual/layout'. This is likely due to the greater visual and layout complexity of the dataset, on which DLA predictions are less accurate. Strikingly, spacing performs generally worse on Infographics, pointing to the heuristic nature of the structure-preserving OCR algorithm of [482], which fails on structurally complex documents with visually-situated language, charts with axis labels, legends, etc.
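To see why the whitespace heuristic is brittle on visually dense pages, consider a minimal rendition of the 'space' parameterization (a simplification we wrote for illustration; the actual algorithm of [482] uses more elaborate line grouping):

def space_layout(tokens, boxes, char_w=8, line_h=20):
    # Place OCR tokens on a character grid at their approximate position.
    # tokens: list of strings; boxes: (x1, y1, x2, y2) pixel coordinates.
    lines = {}
    for tok, (x1, y1, _, _) in zip(tokens, boxes):
        row, col = int(y1 // line_h), int(x1 // char_w)
        line = lines.get(row, "")
        lines[row] = line + " " * max(1, col - len(line)) + tok
    return "\n".join(lines[r] for r in sorted(lines))

# Overlapping chart labels or rotated text easily land on the same grid
# cell, scrambling the rendered layout of infographics.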
The objective of these experiments was to make (distilled) DLA output useful for enriching text-only LLMs with more semantic layout information beyond geometric-spatial relations. For every setting tested, the task instruction (Sec. 6.3.5) is vital (else ANLS < 5%) in the zero-shot setting. We hypothesize that for SP-DocVQA, line/row/column-level key-value pair recognition suffices for attaining good performance, so we expect little benefit from DLA-enriched
\ No newline at end of file
diff --git a/assets/txts/pg_0194.txt b/assets/txts/pg_0194.txt
new file mode 100644
index 0000000000000000000000000000000000000000..539574b060658bcf1f0a3f2a3898209e0882f134
--- /dev/null
+++ b/assets/txts/pg_0194.txt
@@ -0,0 +1,36 @@
prompts. However, as these experiments are bound to the layout classes pre-defined in DocLayNet, we believe that richer layout information, closer to semantic regions (e.g., an address block instead of an OCR block), and including the specification of common document objects such as stamps, logos, and watermarks, should benefit downstream DU tasks.

Table 6.5. Performance per KD method over metrics averaged over architectures on the RVL-CDIP dataset (in-domain) and the RVL-CDIP-N dataset (out-of-distribution).

DIC-KD This task benchmark reports on experiments with 3 backbones, 2 student architectures (except 1 for ResNet), and 6 KD methods each. Tab. 6.6 details the ViT and DiT results, whereas the ResNet results (following similar trends) are available in Appendix D. The same set of experiments was repeated for randomly initialized students (Tabs. D.12 and D.13). Given the comprehensive scope of the DIC experiments, we can make claims regarding the overall most performant KD method, the teacher-student capacity gap, and the architecture-pretraining gap. The ViT-Small student distilled with the SimKD [63] method performs best in terms of accuracy and AURC. Note that the best ViT-Tiny student, with only 5.5M parameters, reaches 83% accuracy with SimKD, only 2.9% behind the best ViT-Small student (22M parameters), showing the potential of advanced KD methods in retaining accuracy at such a large capacity gap. SimKD performs admirably in terms of accuracy, sometimes (depending on the projector type, MLP or CNN) on par with the supervised teacher. In terms of AURC, the NKD and MSE approaches are best-performing, both of which are response-based methods. Regarding the pretraining gap, as shown in Tab. 6.6, the results indicate that a self-supervised teacher like DiT does not meet expectations when distilling knowledge to a ViT-based student pretrained with ImageNet weights. This could be attributed to the large representation gap in feature space between the RVL-CDIP-pretrained and ImageNet-pretrained models. However, evaluation under covariate shift on RVL-CDIP-N (Tab. D.8) demonstrates that DiT-based students (distilled with response-based KD strategies)
DIC-KD This task benchmark reports on experiments with 3 backbones, 2 student architectures (except 1 for ResNet), and 6 KD methods each. Tab. 6.6 details the ViT and DiT results, whereas the ResNet results (following similar trends) are available in Appendix D. The same set of experiments was repeated for randomly initialized students (Tabs. D.12 and D.13). Given the comprehensive scope of the DIC experiments, we can make claims regarding the overall most performant KD method, the teacher-student capacity gap, and the architecture-pretraining gap. The ViT-Small student distilled with the SimKD [63] method performs best in terms of accuracy and AURC. Note that the best ViT-Tiny student with only 5.5M parameters reaches 83% accuracy with SimKD, only 2.9% behind the best ViT-Small student with 86M parameters, showing the potential of advanced KD methods in retaining accuracy at such a large capacity gap. SimKD performs admirably in terms of accuracy, sometimes (depending on the projector type, MLP or CNN) on par with the supervised teacher. In terms of AURC, the NKD and MSE approaches perform best; both are response-based methods. Regarding the pretraining gap, as shown in Tab. 6.6, results indicate that a self-supervised teacher like DiT does not meet expectations when distilling its knowledge to a ViT-based student pretrained with ImageNet weights. This could be attributed to the large representation gap in feature space between RVL-CDIP-pretrained and ImageNet-pretrained models. However, evaluation under covariate shift on RVL-CDIP-N (Tab. D.8) demonstrates that DiT-based students (distilled with response-based KD strategies)
\ No newline at end of file diff --git a/assets/txts/pg_0195.txt b/assets/txts/pg_0195.txt new file mode 100644 index 0000000000000000000000000000000000000000..540acddc76ab63bb2baada7e72630c1ca09ee9f4 --- /dev/null +++ b/assets/txts/pg_0195.txt @@ -0,0 +1,185 @@
outperform ViT→ViT students, pointing to the potential of self-supervision for robustness to distribution shift.
Table 6.6. Results of different KD strategies benchmarked for D/ViT-B teachers applied on the RVL-CDIP dataset.

ViT-B teacher:
Student | Method | ACC | AURC | ECE
– | ViT-B | 0.891 | 0.017 | 0.034
– | ViT-S | 0.853 | 0.030 | 0.058
– | ViT-T | 0.822 | 0.040 | 0.043
ViT-S | Vanilla [τ = 2.5, α = 0.5] | 0.854 | 0.028 | 0.049
ViT-S | NKD [τ = 1, γ = 1.5] | 0.840 | 0.036 | 0.074
ViT-S | MSE | 0.855 | 0.028 | 0.051
ViT-S | SimKD [CLS+MLP] | 0.859 | 0.028 | 0.287
ViT-S | SimKD [CNN] | 0.847 | 0.062 | 0.141
ViT-S | FitNet [middle] | 0.843 | 0.048 | 0.141
ViT-T | Vanilla [τ = 2.5, α = 0.5] | 0.825 | 0.038 | 0.058
ViT-T | NKD [τ = 1, γ = 1.5] | 0.815 | 0.046 | 0.094
ViT-T | MSE | 0.823 | 0.040 | 0.066
ViT-T | SimKD [CLS+MLP] | 0.830 | 0.095 | 0.163
ViT-T | SimKD [CNN] | 0.829 | 0.056 | 0.150
ViT-T | FitNet [middle] | 0.812 | 0.051 | 0.153

DiT-B teacher:
Student | Method | ACC | AURC | ECE
– | DiT-B | 0.933 | 0.075 | 0.010
– | ViT-S | 0.831 | 0.042 | 0.056
– | ViT-T | 0.801 | 0.053 | 0.047
ViT-S | Vanilla [τ = 2.5, α = 0.5] | 0.831 | 0.060 | 0.080
ViT-S | NKD [τ = 1, γ = 1.5] | 0.790 | 0.058 | 0.040
ViT-S | MSE | 0.831 | 0.060 | 0.082
ViT-S | SimKD [CLS+MLP] | 0.838 | 0.087 | 0.438
ViT-S | SimKD [CNN] | 0.851 | 0.048 | 0.136
ViT-S | FitNet [middle] | 0.775 | 0.063 | 0.077
ViT-T | Vanilla [τ = 2.5, α = 0.5] | 0.801 | 0.064 | 0.081
ViT-T | NKD [τ = 1, γ = 1.5] | 0.772 | 0.066 | 0.041
ViT-T | MSE | 0.795 | 0.076 | 0.081
ViT-T | SimKD [CLS+MLP] | 0.816 | 0.104 | 0.439
ViT-T | SimKD [CNN] | 0.832 | 0.056 | 0.152
ViT-T | FitNet [middle] | 0.753 | 0.077 | 0.054

Covariate shift DIC-KD To answer whether certain KD methods harm a student model's robustness to covariate shift, we plot results per KD method, averaged over the 3 backbones (Tab. 6.5). This re-establishes the superiority of SimKD [CNN] in terms of accuracy, both ID and OOD, yet due to poor calibration it loses its gain over the teacher in terms of AURC. Strikingly, MSE attained the lowest OOD performance, whereas it was a solid ID choice. Tab. D.8 provides more detail on the performance of the different KD methods on RVL-CDIP-N, where we observe that, grouped per KD strategy, response-based methods are superior across all metrics.
6.5 Chapter Conclusion
KD-based model compression has been a popular technique in recent years, although DU research has paid little attention to efficiency. Our work explores a limited scope of KD for DU at scale, revealing great potential for creating efficient counterparts of the cumbersome DLA models used today. Specifically, we show that SimKD is a particularly strong KD method, always outperforming vanilla KD and even obtaining a 16x smaller model retaining >90% relative accuracy. Moreover, we investigate the potential of DLA for enriching document inputs in downstream DocVQA tasks. Traditionally, DocVQA has relied on plain OCR text.
While structure-preserving OCR provides a notion of geometric
\ No newline at end of file diff --git a/assets/txts/pg_0196.txt b/assets/txts/pg_0196.txt new file mode 100644 index 0000000000000000000000000000000000000000..c18b4977935bff3c0e05e26671e850e91e16b11e --- /dev/null +++ b/assets/txts/pg_0196.txt @@ -0,0 +1,30 @@
layout for downstream use, DLA was never considered before for the same purpose, yet our experiments show promise.
The more comprehensive benchmarking of KD methods in DIC, with ID evaluation and a covariate shift protocol, reveals interesting observations regarding the feature representation and weight initialization gap between DiT (documents) and ViT (natural images), while also showing that self-supervision makes students more robust in the OOD setting. Our framework enables informed selection of compressed models and directs several interesting explorations: how pretraining objectives impact the distillation process, whether different layout representations (e.g., [15, 187, 263, 443, 555]) allow for a more robust downstream transfer, etc.
Limitations While we primarily use DocLayNet, it remains the DLA dataset with the most diversity in layout elements, both in terms of categories and of shape or size. However, the downstream DocVQA results call for more diversity in terms of document types, domains, and objects (e.g., layout objects such as logos, watermarks, stamps, signatures). Thus, the community is in dire need of a dataset diverse enough to guarantee a performance improvement downstream. Moreover, multimodal KD was not considered in this work, though it holds promise for more efficient, all-round DU models. The downstream task was not tested on [468], as multipage documents are more complex to benchmark with limited-sequence-length LLMs. Also, DLA, being a fairly complicated instance segmentation task, is difficult to adapt for KD-based model compression, ruling out some KD methods. This calls for a better experimental framework and architectural modeling to boost the exploration of KD in DLA, in turn incubating downstream advances in processing and understanding VRDs.
\ No newline at end of file diff --git a/assets/txts/pg_0197.txt b/assets/txts/pg_0197.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a27bfa2b4a7dbcdf2572753674bda416bb97d2a --- /dev/null +++ b/assets/txts/pg_0197.txt @@ -0,0 +1,30 @@
Chapter 7
Conclusion
This final chapter summarizes the work done in this thesis. Additionally, we formulate the key contributions and propose some exciting avenues for future research.
7.1 Summary
To summarize, this thesis contains the following contributions (C) and key findings (→), respective to the research questions from the introduction:
When tested in realistic language data distributions on various text classification tasks, how well do PUQ methods fare in NLP? In which settings are PUQ methods most useful, i.e., to which failure sources/distribution shifts are they most sensitive?
C 1. We conduct a benchmarking study of established PUQ methods applied to six real-world text classification datasets with a focus on model robustness and uncertainty quality. This large-scale study comes with advanced statistical analysis to validate significant differences between methods and datasets.
C 2. We propose a practical experimental methodology to test relevant distribution shifts (cross-domain classification and novelty detection), resulting in a better understanding of the individual shortcomings of PUQ methods.
\ No newline at end of file diff --git a/assets/txts/pg_0198.txt b/assets/txts/pg_0198.txt new file mode 100644 index 0000000000000000000000000000000000000000..764e9237f1d61c7ea7f74f477a4fca657719c1f9 --- /dev/null +++ b/assets/txts/pg_0198.txt @@ -0,0 +1,35 @@
→ The general behavior of PUQ methods does not hold across different datasets. We do observe specific correlations between PUQ methods and the problem setting representing task characteristics, for which we formulated practical takeaways. This reconfirms the need for modality- and task-specific benchmarking of PUQ methods.
→ In general, PUQ methods are sensitive to distribution shifts, and methods that exhibit better in-domain calibration also exhibit better robustness to novel class shifts. The tested setting of cross-domain classification under covariate shift is the most challenging for PUQ methods. This is evident from relatively low AUROC scores due to the presence of comparably similar linguistic patterns across domains.
How can we obtain better PUQ estimates without overrelying on computationally prohibitive methods, e.g., Deep Ensemble [238]?
C 3. We propose novel combinations of PUQ methods, providing both well-motivated intuition and empirical evidence for the complementary benefits of combining different posterior approximation procedures.
→ Our proposed hybrid PUQ methods improve over singular methods in in-domain calibration, novelty detection, and out-of-domain detection. In particular, we show that the combination of Deep Ensemble with Concrete Dropout demonstrates higher diversity in posterior samples and superior performance, even at a smaller ensemble size compared to a Deep Ensemble.
How important are certain prior, neural architecture or hyperparameter influences on the quality of PUQ estimation?
C 4. We conduct a range of ablation experiments to investigate the influence of prior, neural architecture and hyperparameter choices on the quality of PUQ estimation. In particular, the number of stochastic posterior samples, the dropout rate, and the architecture are shown to have a significant impact on the quality of PUQ estimation.
\ No newline at end of file diff --git a/assets/txts/pg_0199.txt b/assets/txts/pg_0199.txt new file mode 100644 index 0000000000000000000000000000000000000000..077b5c0d64b92ca92293adea39fa54193dd891d1 --- /dev/null +++ b/assets/txts/pg_0199.txt @@ -0,0 +1,31 @@
→ The combination of posterior geometry and weight-based priors proves to be powerful for PUQ estimation, with the Deep Ensemble and Concrete Dropout methods as the best-performing methods in our benchmark. Nevertheless, it is important to consider adapting the dropout rate to the text classification task at hand, which, individually and in an ensemble, impacts model robustness and uncertainty quality.
→ Contrary to previous work, we find that pretrained transformers in NLP severely underperform in novelty detection compared to 1D CNNs, limiting the applicability of transfer learning when distribution shift from novel classes can be expected.
How severe is the problem of hallucination and control in LLMs when evaluated in a selective, free-form DocVQA task setting?
C 5. We design the DUDE dataset with this task setting in mind, incorporating a large set of unanswerable questions that are realistic and relevant to the document's content.
→ Hallucination and control remain severe problems in LLMs, with a large fraction of unanswerable questions being answered with high confidence. When trained on a large set of unanswerable questions, LLMs improve at identifying unanswerable questions, yet at the expense of harder, abstractive questions, on which they become overcautious (e.g., ChatGPT predicting more than half of abstractive questions as unanswerable). With longer context, LLMs are also more likely to hallucinate answers. Overall, results lag behind the human baseline performance on DUDE, indicating that LLMs are still far from being able to reason about documents in their entirety without control measures.
\ No newline at end of file diff --git a/assets/txts/pg_0200.txt b/assets/txts/pg_0200.txt new file mode 100644 index 0000000000000000000000000000000000000000..dba26c61ef7ea379c56219d8db015f1da1e9927e --- /dev/null +++ b/assets/txts/pg_0200.txt @@ -0,0 +1,29 @@
How can we iteratively close the gap between research and practice in DU?
C 6. We take stock of the balance between research and applications in document classification, a prototypical DU task, and we identify the main challenges that are stalling progress in the field, with a focus on data construction and evaluation methodology.
C 7. We propose a novel formalization of multipage document classification scenarios, which we use to construct two novel datasets, RVL-CDIP_MP and RVL-CDIP-N_MP, which are more realistic and more challenging than their single-page counterparts.
C 8. We conduct an insightful experimental analysis of the novel datasets.
→ The experimental analysis reveals that current SOTA models are not able to leverage the additional context provided by multipage documents, and that the performance gap between single-page and multipage document classification is still large. Ablation experiments show the promise of advancing multipage document representation learning and inference.
→ Major dataset construction efforts are required to bridge the currently existing gap and to be able to rely on benchmarks for transfer to real-world applications. In particular, we identify the need for more realistic and more challenging datasets, regarding, e.g., the type and diversity of document data and the variety and quality of label sets, as well as the need for more comprehensive evaluation methodologies.
\ No newline at end of file diff --git a/assets/txts/pg_0201.txt b/assets/txts/pg_0201.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5549ac37bd0bb7b32847e9f4c2d04f39d7bcc68 --- /dev/null +++ b/assets/txts/pg_0201.txt @@ -0,0 +1,29 @@
How can we design a resource that comprehensively challenges the state-of-the-art? Which DU aspects are most challenging for current state-of-the-art LLMs? How can these be incorporated in a benchmark to allow proper measurements of future improvements?
C 9. We have designed a completely novel benchmark from the ground up, DUDE, collecting 40K QA pairs for 5K documents and constructing a multi-faceted dataset (multipage (µ = 6), multi-domain (±15), multi-type (±200), multi-QA (extractive, abstractive, list, unanswerable), multi-task (DIC, KIE, DLA, DOD, etc.), multi-OCR (Tesseract, Azure, AWS), multi-source, multi-stage (<5) annotations) to foster research on generic DU, bypassing long-context restrictions and evaluating the reliability and robustness of DU technology, as close as possible to real-world requirements.
C 10. The dataset construction approach of DUDE is based on a set of principles that we have formulated, which we believe are essential for a comprehensive benchmark for generic DU. More specifically, leveraging the DocVQA task paradigm and the learning paradigm of Multi-Domain Long-Tailed Recognition allowed us to incentivize harder questions on visual/layout semantics, layout navigation, or multi-step reasoning, while organically obtaining questions relevant to the document type and instance.
C 11. We have conducted our own baseline experiments on DUDE, evaluating the performance of SOTA DU models on the different facets of DUDE, as well as the reliability and robustness of LLMs in the context of DU. Next, we have organized a competition to challenge the community's best, additionally incorporating OOD detection and selective generation to evaluate CSFs on two common failure sources.
\ No newline at end of file diff --git a/assets/txts/pg_0202.txt b/assets/txts/pg_0202.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6fd5aa7fbd7e477678a5c012fb255691d4827e --- /dev/null +++ b/assets/txts/pg_0202.txt @@ -0,0 +1,42 @@
→ The best results attain ANLS <= 50%, with our baseline T5-2D (8K context) scoring 46%; the competition winner improves on this by 4% absolute by leveraging multimodal LLMs (BLIP2 and ChatGPT). Generally, stronger performance is expected from models that incorporate layout understanding and reasoning over multiple pages. Nevertheless, diagnostic results prove that the current SOTA still suffers on questions with visual evidence (only half of human performance) or any reasoning operations (counting, comparison, etc.). With the rise of multimodal LLMs, better solutions are coming, yet due to its designed complexity, DUDE might remain "the benchmark to beat" for a long time.
→ Even while DUDE presents a great test bed for the challenge of long-context processing (Section 2.3.4.1), the evaluated models have not yet reached the point where they can fully leverage the additional context. This is a clear indication that more research is needed in the direction of efficient processing of long, structured documents.
→ We find that the quality of confidence estimation worsens with longer context, potentially from having to consider more possible answers. We also find that using a maximum-confidence strategy over answers generated per page results in substantially worse calibration. These interactions between multiple DU challenges prove the usefulness of incorporating and evaluating them jointly in a benchmark.
How can we efficiently infuse LLMs with semantic layout awareness for more focused information extraction? To what degree can model compression resolve the problem of efficiency in processing documents?
C 12. We propose a novel experimental methodology to investigate the enrichment of VRDs with semantic layout structure, derived from effective distillation of DLA models, to practically and efficiently improve downstream DU applications. This includes evaluation of KD methods in DIC under covariate shift, and a downstream setup to assess the robustness of distilled DLA models on zero-shot layout-aware DocVQA.
C 13. We present the first application of KD to visual document tasks (DIC, DLA), investigating the teacher-student knowledge gap in KD-based model compression methods (response- and feature-based) with task architectures involving different inductive biases (CNN vs. ViT), pretraining (self-supervised), student initialization, and capacities (base-small-tiny).
\ No newline at end of file diff --git a/assets/txts/pg_0203.txt b/assets/txts/pg_0203.txt new file mode 100644 index 0000000000000000000000000000000000000000..260b5b3f55281759863594597306c9edc80cae4d --- /dev/null +++ b/assets/txts/pg_0203.txt @@ -0,0 +1,44 @@
→ While we have promoted the use of semantic layout over geometric layout for enriching LLM prompts, this results in only limited performance improvements, which we attribute to either the zero-shot evaluation setup or the limited subset of layout classes and the domain shift from the DLA training data (DocLayNet). In some cases, e.g., questions involving visual/layout evidence, DLA-enriched prompting proves more useful.
→ KD-based model compression is very effective in reducing model size while maintaining accuracy at large capacity gaps; e.g., a strong student is SimKD ViT-Tiny, which retains 93% of teacher accuracy in relative terms while being 16x smaller. Ablations show how the teacher-student knowledge gap is affected by the inductive biases of the task architecture, the pretraining of the student, the student initialization, and the student capacity. For example, a self-supervised teacher provides more robust students when evaluated under covariate shift. Nevertheless, model compression is but one tool in a larger toolbox for the efficient processing of documents, which we believe is a key challenge for future research, going hand-in-hand with efficient longer-context modeling.
As this thesis was conducted in an applied research environment, and keeping in mind that nowadays DL research is primarily empirical, our contributions have focused on datasets and experimental methodology rather than on novel algorithms, which more often than not present mere incremental improvements on the state-of-the-art. Nevertheless, we believe that the proposed datasets and experimental methodologies are of great value to the community, as they provide a more realistic and more challenging test bed for future DU research. We are happy to see the proposed datasets and experimental methodologies increasingly being adopted by the community, and we hope this will foster research on more efficient and closer-to-real-world document processing, ultimately leading to more reliable and robust DU technology.
7.2 Perspectives For Future Research
This Section discusses some exciting research opportunities left for future work. First, we present a curated set of research questions particular to PUQ, calibration, and failure prediction, which, when relevant, are linked to DU applications.
Next, we take a futuristic look at the design of a fully-fledged IA-DU solution, dreaming up the ultimate dataset and system design for DU.
\ No newline at end of file diff --git a/assets/txts/pg_0204.txt b/assets/txts/pg_0204.txt new file mode 100644 index 0000000000000000000000000000000000000000..917f59b31ce7692852e146c00cd9da5ea42bcec7 --- /dev/null +++ b/assets/txts/pg_0204.txt @@ -0,0 +1,49 @@
7.2.1 Open Problems In Reliability & Robustness
Recent advancements in LLMs have brought many groundbreaking improvements to the field of DU, yet the reliability of LLMs is still far from solved. This is exacerbated by API-based services or closed-source LLMs [344], which are to be treated as black boxes without access to model internals or token-level output logits, making it hard to apply most PUQ methods. Popular white-box approaches include verbalized probabilities [273] or semantic entropy [226] for taking into account semantic equivalence or specificity (e.g., Where was the 2023 International Conference on Computer Vision held? → In Paris vs. In the capital of France vs. In Europe). Specific to selective generation, when knowledge on a topic is limited, it can be hard to censor LLM outputs (even when finetuning further with human feedback) or to evaluate abstention reliably (e.g., I don't know vs. I don't care vs. '').
[111] implement a framework bundling a battery of white-box and black-box methods for LLM confidence estimation in text generation, yet it still requires human inspection of the generated text together with the confidence score, which is not very scalable for large-scale document processing. This ties into the evaluation crisis of LLMs, a topic of active research [137]. In the short term, it might suffice to reward models that predict the full distribution of human judgments or learn human preferences for generated text. However, how can we expect models "to do what humans do" when even humans disagree or are not consistent in their judgments? Alternative approaches are to rationalize judgments, attribute or ground the evidence used for a judgment, or ask for clarifications when needed. In the long term, we should move beyond human evaluation, which is expensive, time-consuming, and not scalable. Important explorations include prompt chaining (Please give a confidence between 0 and 1 about how certain you are this is the correct answer) or self-evaluation [207, 391] to induce reflections on the quality of LLM outputs.
Beyond the potentially infinite, though countable, output spaces of generative tasks, there exists an opportunity to study calibration for specific output spaces, e.g., sequence-structured outputs in the context of sequence tagging or restricted sequence-to-sequence tasks. Moreover, calibration metrics and methods can be adapted to the specific task or output space, such as structured prediction [227], named entity recognition [222], object detection and segmentation [85, 234, 350], etc. With most works (if at all) reporting top-1 miscalibration, efficient estimation of "stronger" calibration notions is a crucial area of study to inform the derivation of calibrated regularized loss functions [370]. On the more theoretical side, it remains vital to investigate the link between non-convex optimization (e.g., flat minima) and calibration, as well as when optimizing a proper loss yields calibration [42, 549].
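To make the prompt-chaining idea concrete, a minimal sketch of eliciting a verbalized confidence in a second dialogue turn; the llm_chat helper and the exact prompt wording are hypothetical placeholders, not a specific provider API:

def answer_with_confidence(llm_chat, document_text, question):
    # llm_chat: any callable that maps a list of chat messages to a reply string
    messages = [{"role": "user",
                 "content": f"{document_text}\n\nQuestion: {question}\n"
                            "Answer with a short span, or 'unanswerable'."}]
    answer = llm_chat(messages)
    # Chain a second turn asking the model to verbalize its confidence
    messages += [{"role": "assistant", "content": answer},
                 {"role": "user",
                  "content": "Please give a confidence between 0 and 1 about "
                             "how certain you are this is the correct answer."}]
    try:
        confidence = float(llm_chat(messages).strip())
    except ValueError:
        confidence = 0.0  # an unparseable verbalization counts as no confidence
    return answer, min(max(confidence, 0.0), 1.0)

Note that such verbalized confidences are themselves uncalibrated outputs, which is precisely why their quality needs to be measured.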
\ No newline at end of file diff --git a/assets/txts/pg_0205.txt b/assets/txts/pg_0205.txt new file mode 100644 index 0000000000000000000000000000000000000000..b84b7e40962befd9c2ecba3218d4468e5426844a --- /dev/null +++ b/assets/txts/pg_0205.txt @@ -0,0 +1,45 @@
Selective prediction has been garnering increased attention thanks to increasingly comprehensive benchmarks [127, 193], yet these have (again) been focused on vision problems and architectures, inviting the same level of benchmarking on alternative modalities and tasks. To the best of our knowledge, there exists no work on extending selective prediction methodology to multi-task settings (e.g., consider the typical combination of document classification and KIE), requiring a more complex learned CSF (for different output spaces) or a combination of multiple CSFs with multiple thresholds. Similar to calibration, differentiable loss functions for failure prediction are an open problem. More theoretical questions include the relationship between stronger notions of calibration and confidence ranking, as well as the link between feature space disentanglement and CSF ranking [552]. In the low-data regime, sample-efficient failure prediction is an open problem, which could leverage connections to semi-supervised and active learning [112].
7.2.2 A Future-Proof Design Of IA-DU
Downstream datasets are a key component of any practical, supervised ML solution, yet they are often overlooked in the expectation of decent zero-shot performance with LLMs, which are trained on large-scale, generic language datasets, such as Common Crawl or the Pile [130]. While these datasets are very useful for pretraining general language understanding, they are not sufficient for all possible downstream tasks. This is especially true for DU, where text is but one of the modalities to be considered. As part of the conclusion to this thesis, we first discuss how to obtain the ultimate dataset for generic DU, and next we detail the design of a fully-fledged IA-DU solution.
7.2.2.1 The 'Ultimate' DU Dataset?
Arguably, a core contribution of this thesis is the design of the DUDE dataset, which we believe is a step in the right direction toward the ultimate dataset for generic DU. Top-of-mind extensions of DUDE include: multilingual or cross-lingual documents and questions; answer and evidence grounding to improve evaluation and interpretability; and question decomposition and simplification. Finding a complete answer to the question of the ultimate DU dataset would be transformative to DU technology, yet here we can only provide some pointers, discussed in the structure of goal, starting points, and aspects to target.
\ No newline at end of file diff --git a/assets/txts/pg_0206.txt b/assets/txts/pg_0206.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d167d5e55a2ca65f6b80fe9d857f9a8dc5c1ff5 --- /dev/null +++ b/assets/txts/pg_0206.txt @@ -0,0 +1,45 @@
Goal DU requires reasoning over documents in their entirety, which is a very complex task with the aforementioned challenges. With current technology, this involves learning document representations that are both rich and compact, and that can be used to answer any question about the document.
Consider how challenging this is when most relevant questions are either about the intentionality of the document's author or about the way a user interacts with it, hinting at a potential observer's paradox in future data collection. For example, on a car invoice, an accountant would ask What is the total amount due?, or Is this a valid invoice with correct taxation?, while a customer would ask How much do I finally have to pay?, and the insurance broker What is the chassis identifier to link the omnium coverage to?. A model should be able to capture all these nuances about the complexity of a document, which could be seen as the expectation of all possible relevant questions that can be asked about it, while also being able to generalize to unseen documents and questions. Therefore, the goal of the ultimate DU dataset is to provide a test bed for evaluating progress in commonsense reasoning on documents from real-world interactions, for which we hypothesize that the scale and depth of supervision are vital.
Starting points The ultimate DU dataset should be designed with the aforementioned goal in mind, yet some seminal ML datasets can be inspiring. While the 'ImageNet moment' is etched in everyone's memory, MS COCO [274] was arguably a more impactful dataset thanks to its large-scale, diverse, and high-quality nature combining multiple tasks (image captioning, object detection, semantic segmentation, etc.). To build the equivalent of MS COCO in document understanding, DUDE offers a good starting point, under some conditions and with necessary extensions. An important aspect concerns ground truth collection for DocVQA and the complexity and specificity of questions and answers, which has been approached differently by recent works: DUDE uses a multi-stage approach to collect a large set of minimally constrained, human-generated questions under the MDLT paradigm, which were afterward annotated with diagnostic categories; PDFTriage [400] pre-defines question types and collects a small set of human-generated questions; DocEdit [311] establishes a pre-defined taxonomy and tests language as a universal UI to interact with the hierarchical, discrete structure of documents. The extent to which the collected QA pairs constitute a representative sample of the space of all possible and relevant questions that can be asked about a document instance is an open problem, which can be approached by (A) extending and scaling up existing practices or (B) deepening supervision for models to generalize better from limited inputs.
\ No newline at end of file diff --git a/assets/txts/pg_0207.txt b/assets/txts/pg_0207.txt new file mode 100644 index 0000000000000000000000000000000000000000..69371f16ab3d4db4766f12845cf4b4fe95b4144f --- /dev/null +++ b/assets/txts/pg_0207.txt @@ -0,0 +1,47 @@
A. Scale We identify three targets to scale up: (I) document collection, (II) question collection and validation, and (III) question-answer generation.
(I) Throughout the document dataset construction, the goal is to collect a large set of diverse document types and instances, differing in all modalities (language, layout, visual, etc.) and additional meta-criteria (industry, language, type, etc.).
The document collection approach taken in DUDE was a fairly artisanal process: based on experience, we designed an industry-document taxonomy, which we used to collect a large set of document types and instances, also taking into account the presence of different visual semantics or document objects, e.g., handwriting, stamps, watermarks, address blocks, etc. We leveraged a semi-automatically created keyword-style search ('Please list 30 common retail document types with their synonyms like Credit memos - {"credit notes", "credit slips", "refund slips"}') on public document collections, and validated diversity post-hoc in terms of modality-specific features (TF-IDF or ResNet features) vs. other datasets.
A more scalable approach would be to leverage cluster-based diverse sampling from larger document collections, such as Common Crawl [460]. While this approach would be more scalable, it would be challenging to ensure that the collected documents are diverse in terms of all modalities, which is a topic to be investigated. Relevant caveats are the presence of duplicates, sensitive information, and the need to balance language priors so as not to create Clever Hans effects for models to later exploit [405]. An active topic of research is document generation [169] or augmentation [304], which could fill the gap in document diversity, yet it would be challenging to ensure that the generated documents are both realistic and diverse. Seeing that business documents are hard to obtain, one could backtrack to visually-situated language.
(II) To ensure that questions are specific to a document, and not merely testing language understanding, cross-lingual questions could help counter reliance on language priors. However, both multilingual documents and cross-lingual questions are challenging to collect, as they require annotators capable of reading multiple languages. How people interact with documents (i.e., the questions asked) without being systematically observed is what makes for interesting data, yet it is also the most challenging to collect. This is certainly true for subject-matter experts from different industries (government, finance, legal, etc.) who are not readily available for annotating documents. Naturally, as more documents are being collected, one should define a strategy to scale up the number of questions per document in a balanced way. Ideally, the number of questions per document should be a function of the document complexity, which is another open problem. Some basic strategies would be (i) to split questions evenly over pages by chunked annotation, yet this would constrain multi-hop and naturally complex questions, or (ii) to exploit the Gestalt principle [294], which states
\ No newline at end of file diff --git a/assets/txts/pg_0208.txt b/assets/txts/pg_0208.txt new file mode 100644 index 0000000000000000000000000000000000000000..749a5e2c5420ec586ff1495b3a24094067a8275e --- /dev/null +++ b/assets/txts/pg_0208.txt @@ -0,0 +1,42 @@
that the number of questions should be higher for heterogeneous elements in a document. Finally, an untapped approach would be to generate questions automatically, which is an open research challenge.
(III) QA generation holds promise to grow a large-scale dataset. A possible approach would be to teach the current SOTA model on DUDE to generate questions (given possible answers, predict questions) similar to those in the training set.
A harder problem is the generation of unanswerable questions, which we found hard to even elicit from humans. Potential caveats are the quality and factuality [303] of the generated questions. This might be improved upon by first generating rich and compositional captions for a document relative to its content and visual appearance, and then generating different questions based on the descriptions, with both paraphrasing and backtranslation for question variations and augmentations.
B. Supervision Depth The reasoning behind increasing the depth of supervision is that we might be expecting too much, i.e., answering complex questions involving multiple manipulations of document-instance and/or domain-specific concepts based on a single set of reference answers, with a poor stimulus [476], i.e., not providing enough, complex enough, and diverse enough examples for models to generalize well.
Accounting for every possible question will be impossible. A possible approach, inspired by MDLT and the diagnostic categories in DUDE, is to (i) decompose questions in terms of the skills and concepts (Definitions 15 and 16) required to answer them and pass this along as instructions; and (ii) hyper-annotate more explicit answers, with answer and evidence grounding for attribution, better explaining the relations between primitives (skill-concept compositions). Figure 7.1 illustrates an example of (ii), where the answer is decomposed into a skill-concept composition, and the evidence is grounded to the relevant document objects. Such rich supervision should help models both discriminate known skills and concepts and generalize better to new skill-concept compositions. Although it would be expensive to obtain such supervision in large quantities, the use of human-in-the-loop or active learning could reduce the annotation burden.
Definition 15 [concept]. An abstract term to denote document visual objects (atomic [cell, barcode] and molecular [table, chart, form]) and entities (generic [document identifier, person, date] and domain-specific [invoice number, insured, payment date]).
Definition 16 [skill]. Any manipulation [existence, counting, relation,
\ No newline at end of file diff --git a/assets/txts/pg_0209.txt b/assets/txts/pg_0209.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd193913cf517a2e374ffee2d8f5be6564c1d0e2 --- /dev/null +++ b/assets/txts/pg_0209.txt @@ -0,0 +1,31 @@
Figure 7.1. Example of ground truth formatting for a question-answer pair in DUDE.
hasattribute, etc.] of a concept, or a combination of the concepts (evidence) involved.
Our overall idea is similar to how [243] alludes to intelligence: "the ability to decompose a problem into a set of skills and concepts, to reuse those skills and concepts in new situations, or acquire new ones quickly". The proposed format would be a full-featured instruction tuning dataset, which has proven very useful in other settings [404, 486] and which could be a valuable resource for future research on instruction-based learning of already existing and future DU tasks.
Naturally, all of this relies on the assumption that each question-answer pair can be decomposed into skill-concept compositions, and that there exists an exhaustive taxonomy of skills and concepts for DU, which thus far has not been created.
A possible approach would be to leverage existing resources such as VerbNet [410] to define skills, or to build an API for DocVQA similar to [437, 535] to decompose questions into programs with subroutines, e.g., How many of the contract's pages have signatures? → Counting([Navigation(document), Existence(signature, page)]); and to construct a complete taxonomy of document concepts in both a bottom-up (human prior) and top-down (data-driven) fashion, extending it over time with domain-specific concepts. Ideally, this taxonomy should not be static at inference time, hinting at more research needed into neuro-symbolic learning for dynamic knowledge graphs to assist in recognizing and adding new concepts [32].
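As an illustration only (the types and names below are ours, not an existing DocVQA API), such skill-concept programs could be represented as a small recursive data structure:

from dataclasses import dataclass, field

@dataclass
class Concept:
    name: str  # e.g., "signature", "page", "document"
    def __repr__(self):
        return self.name

@dataclass
class Skill:
    name: str  # e.g., "Existence", "Counting", "Navigation"
    args: list = field(default_factory=list)  # Concepts or nested Skills
    def __repr__(self):
        return f"{self.name}({', '.join(repr(a) for a in self.args)})"

# "How many of the contract's pages have signatures?"
program = Skill("Counting", [
    Skill("Navigation", [Concept("document")]),
    Skill("Existence", [Concept("signature"), Concept("page")]),
])

Printing program yields Counting(Navigation(document), Existence(signature, page)), the decomposed form referenced above.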
There are several fundamental questions that can be asked here: "Is it needed to
\ No newline at end of file diff --git a/assets/txts/pg_0210.txt b/assets/txts/pg_0210.txt new file mode 100644 index 0000000000000000000000000000000000000000..c061f3e72b3b5bb0f772e68d891e84bdaeb94e36 --- /dev/null +++ b/assets/txts/pg_0210.txt @@ -0,0 +1,47 @@
collect thousands of QA-pair examples to learn a specific document skill-concept composition, e.g., address block detection?" Recent works seem to suggest not, indicating an emergent ability of the current best LLMs to find zero-shot solutions to a broad range of analogy problems [486]. Finally, building ground truth more amenable to advanced prompting and instruction-based learning [248] will likely prove as useful as question decomposition has in semantic parsing [189, 358, 525].
7.2.2.2 A Feature-complete IA-DU Solution?
The main takeaway of this thesis is that while more compute, more data, and more powerful algorithmic tools have allowed significant progress in DU, there is still a long way to go toward the objective of reliable, robust, realistic, and efficient DU. For now, a major component would be a general-purpose Transformer-based stack for interfacing with a document through natural language. Most likely, this would be a multimodal LLM pretrained with a variety of pretraining objectives on the richest and largest possible corpus of documents and related data. When zero-shot performance is not sufficient, it would be instruction-finetuned on new QA pairs, e.g., in the rich format proposed in Section 7.2.2.1, resulting in efficient adapters that can be served concurrently on the same prediction model [417]. However, this is not a complete solution, as generative modeling brings additional challenges (e.g., expensive pretraining, decoding-based inference, confidence estimation, dependence on human evaluation, scalability).
Instead, we will focus here on another component of a complete solution, namely a failure forecaster, which we believe to be equally important for bringing LLMs closer to real-world applications. We envision this to be a lightweight module separate from the prediction model, which could easily be fully retrained and updated with new data, bypassing the risk of catastrophic forgetting and the need for retraining the more cumbersome LLM. The failure forecaster should predict the performance of the LLM on a given input (document, question, metadata, etc.) and output (answer). It can be a very simple (e.g., logistic regression) or complex model (e.g., a large DNN), yet most of its complexity resides in the feature modeling and subsequent learning of the sources of uncertainty. Our failure forecaster design is informed by [114]. We non-exhaustively identify sources of failure or uncertainty that can be modeled by the failure forecaster: (i) input uncertainty, (ii) output uncertainty, and (iii) distributional metrics. We discuss each of these in turn.
(i) Before answering any question, the document instance should be analyzed for inherent uncertainty or quality issues: e.g., whether it is born-digital or OCRed, the quality of the OCR, readability metrics to capture how easy the document text
\ No newline at end of file diff --git a/assets/txts/pg_0211.txt b/assets/txts/pg_0211.txt new file mode 100644 index 0000000000000000000000000000000000000000..94bfe19ae8bcaeb0b27e9f670bcb2c103c057ed1 --- /dev/null +++ b/assets/txts/pg_0211.txt @@ -0,0 +1,32 @@
is to read, the complexity of the layout graph, and visual richness. Next follows the question analysis: e.g., specificity, complexity, ambiguity, relevance, and novelty. Each of these can be measured by heuristic approximations such as the number of tokens or entities, how many of the entities literally appear in the document, the number of possible answers, the context size required to answer the question, the semantic overlap between the question and the document, how similar the question is to training data questions, the grammatical correctness, and the syntactic complexity. Finally, the metadata analysis: e.g., the number of documents in the same domain, the number of documents of the same type, and the number of documents in the same language.
(ii) The output uncertainty can be modeled by the confidence of the LLM in its predicted answer, which can be estimated by PUQ methods and a variety of CSFs [111], which are hypothesized to capture complementary sources of uncertainty. Specific to the answer, the same question-document aspects return here, along with how extractive the answer is, the answer structure, and paraphrasing diversity.
(iii) Feature representations of new documents, questions, and answers can be assessed relative to their individual and joint distance to the training distribution [477]. This will be quintessential for distributional shift detection.
A failure forecaster trained to predict the performance of the LLM on all this information can be used to decide whether to abstain from answering, ask for clarifications from the model or human, ask for additional context, demand question rephrasing or a clearer document input, or even additional metadata. Ultimately, this will be useful to improve reliability and robustness for real-world IA-DU applications, where the risk of failure demands substantial control.
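As a minimal sketch of how simple such a forecaster can be, a logistic regression over the three feature groups; the feature names are illustrative assumptions drawn from the discussion above, not a fixed specification:

import numpy as np
from sklearn.linear_model import LogisticRegression

def forecast_features(sample):
    # One feature vector per (document, question, answer) triple, covering
    # (i) input, (ii) output, and (iii) distributional signals.
    return np.array([
        sample["ocr_quality"],            # (i) document quality
        sample["question_length"],        # (i) question complexity proxy
        sample["llm_confidence"],         # (ii) CSF of the predicted answer
        sample["answer_extractiveness"],  # (ii) answer overlap with the document
        sample["train_distance"],         # (iii) distance to training distribution
    ])

# Train on held-out triples labeled with whether the LLM answered correctly
X = np.stack([forecast_features(s) for s in held_out_samples])
y = np.array([s["is_correct"] for s in held_out_samples])
forecaster = LogisticRegression(class_weight="balanced").fit(X, y)

# At inference: abstain (or escalate) when predicted success probability is low
p_success = forecaster.predict_proba(X_new)[:, 1]
abstain = p_success < 0.5

Because the forecaster is cheap to retrain, the threshold and feature set can be revisited whenever new failure data arrives, without touching the LLM.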
\ No newline at end of file diff --git a/assets/txts/pg_0212.txt b/assets/txts/pg_0212.txt new file mode 100644 index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf --- /dev/null +++ b/assets/txts/pg_0212.txt @@ -0,0 +1 @@
\ No newline at end of file diff --git a/assets/txts/pg_0256.txt b/assets/txts/pg_0256.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdba609a16293924b742c098662df4848084b755 --- /dev/null +++ b/assets/txts/pg_0256.txt @@ -0,0 +1,46 @@
20news and AAPD), retain the original document lengths, remapping tokens with a frequency lower than 3 to UNK; PAD tokens are masked throughout. For TextCNN, 300-D embeddings are uniformly initialized, upon which three different kernel sizes (3, 4, 5) operate with 100 feature maps per kernel, followed by a max-pooling operation. For BERT, we tokenize and encode using the standard BERT tokenizer, with the maximum sequence length determined per dataset [20news: 250, CLINC: 50, IMDB: 350 and Reuters/AAPD: 200].
Following the MC Dropout procedure, we apply dropout [431] with a rate of 0.5 after each non-linear weight layer. We found a global weight decay rate of 1e-4 [224, 293] to work well for TextCNN, whereas we disabled weight decay for BERT since it over-penalized model complexity, resulting in vanishing gradients. When training TextCNN, Adam optimizes the cross-entropy or heteroscedastic loss (see Section 3.3.2.4) with a learning rate of 1e-3 for 45 epochs on batches of size 32. For fine-tuning BERT, we schedule the learning rate from 1e-5 to 1e-6 with batch size 16 and train for 20 epochs (longer than the original recommendation, following [436]). We use early stopping conditioned on the validation loss, with sufficient epochs to ensure all models are trained until convergence; otherwise, the models might have learned to approximate the mean of the predictive posterior distribution well, but not the variance. At evaluation time, we estimate the predictive mean and uncertainties by drawing T samples from the approximated predictive posterior distribution or by averaging over M models. We have empirically set T to 10 and, for ensembles, the number of models M to 5.
B Practical Considerations
B.1 Take-home Summary
Concretely, for a multi-class problem with a large number of classes, incorporating input-dependent data uncertainty improves accuracy and novelty detection. With high label cardinality in multi-label classification, we recommend ensembling for more reliable epistemic uncertainty estimation. More generally, we advise against using MC Dropout if the dropout rate and weight regularization are not fine-tuned for the problem at hand, drawing parallels to dropout probability rates adaptively learned with Concrete Dropout.
Hyperparameter considerations We reiterate important hyperparameters and reasonable defaults for text classification tasks similar to our benchmark setup and applications of the above.
\ No newline at end of file diff --git a/assets/txts/pg_0257.txt b/assets/txts/pg_0257.txt new file mode 100644 index 0000000000000000000000000000000000000000..da614e414be9f7b9564e12f10e36bb61c4248a67 --- /dev/null +++ b/assets/txts/pg_0257.txt @@ -0,0 +1,63 @@
• Dropout rate p: the original work suggested a fixed binary rate (p=0.5), whereas our experiments indicate that different rates are more applicable per dataset. It is best to cross-validate layer-wise dropout probabilities for any real-world application; where impossible, this warrants the low effort of incorporating Concrete Dropout, consequently reducing experimentation time.
• Weight decay L2: it is best to start with small values [1e-6 - 1e-4] and fine-tune accordingly. Take care not to apply global weight decay in the case of pretrained weights, which already have high weight magnitudes, as this possibly impedes learning.
• MC Dropout T: a small number (T=10) of stochastic samples suffices; with a large number of classes K, scale T sub-linearly with K. T also applies to the number of samples drawn to calculate the heteroscedastic loss, so beware of increasing it to too large values, since it affects training compute.
• Ensemble size M: a total of (M=5) ensemble models is plenty, certainly when combined with a fine-tuned dropout rate at the individual model level.
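As a minimal sketch of the evaluation-time procedure described above (our benchmark code differs in details), MC Dropout inference with T stochastic samples, yielding the predictive mean and the uncertainty quantities used throughout this appendix (predictive entropy, expected data uncertainty, and mutual information):

import torch

def mc_dropout_predict(model, x, T=10):
    # Keep dropout active at inference by staying in train mode
    # (safe here since our models contain no batch normalization)
    model.train()
    with torch.no_grad():
        probs = torch.stack([model(x).softmax(-1) for _ in range(T)])  # T x N x K
    mean = probs.mean(0)  # predictive mean
    total = -(mean * mean.clamp_min(1e-12).log()).sum(-1)  # predictive entropy
    data = -(probs * probs.clamp_min(1e-12).log()).sum(-1).mean(0)  # expected entropy
    mutual_info = total - data  # epistemic (model) uncertainty
    return mean, total, data, mutual_info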
B.2 Compute vs. Performance Trade-off
Next to performance, practitioners are generally concerned with computational and memory costs. [462] present similar concerns in the benchmarking of uncertainty methods. Considering the cost of compute vs. storage, each uncertainty method impacts both differently. Following [348], we present computational and memory costs for the evaluated methods symbolically (Big-O), with m flops or storage for a trained model, l flops or storage for the last layer, T the number of samples or replications, and ι the number of GP inducing points.
Table A.1. Compute and storage costs in Big-O notation [348] for uncertainty methods.
Method | Compute/N | Storage
Baseline | m | m
MC (Concrete) Dropout | mT | m
Heteroscedastic | m + l(T − 1) | m(+l)
Deep Ensemble | mT | mT
cSGMCMC | m | mT
SNGP | m + ι² | m
Our experiments were carried out on a system with an Intel Core i7-10750H 2.6 GHz CPU and an NVIDIA GeForce RTX 2070 Max-Q GPU.
Additionally, we provide informative tables with training (Table A.2) and test (Table A.3) timings over all single models on CLINC-OOS.
\ No newline at end of file diff --git a/assets/txts/pg_0258.txt b/assets/txts/pg_0258.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f0f49e39dac023a41aed4d66ac204d7546c074f --- /dev/null +++ b/assets/txts/pg_0258.txt @@ -0,0 +1,217 @@
Table A.2. CLINC-OOS models with training timings (in seconds) per epoch and total running time.
method | architecture | train time/epoch | epoch finished | train runtime
Unregularized | TextCNN | 32 | 8 | 256
Regularized | TextCNN | 32 | 28 | 896
Heteroscedastic | TextCNN | 59 | 17 | 1003
Concrete Dropout | TextCNN | 35 | 12 | 420
Heteroscedastic Concrete Dropout | TextCNN | 58 | 10 | 580
Unregularized | BERT | 420 | 5 | 2100
Regularized | BERT | 691 | 11 | 7601
Heteroscedastic | BERT | 710 | 16 | 11360
Concrete Dropout | BERT | 679 | 9 | 6111
Heteroscedastic Concrete Dropout | BERT | 707 | 16 | 11312
Table A.3. CLINC-OOS models with inference timings presented in unit time for how many batches or samples can be processed in 1 second wall-clock time on CPU and GPU. For the short sequences of CLINC, both models allow a batch size of 32.
architecture | method | # batch (gpu) | # sample (gpu) | # batch (cpu) | # sample (cpu)
TextCNN | Unregularized | 59.0 | 1891 | 63.0 | 2043
TextCNN | Regularized | 66.0 | 2134 | 60.0 | 1922
TextCNN | MC Dropout | 53.0 | 1708 | 32.0 | 1050
TextCNN | Heteroscedastic | 693.0 | 22176 | 482.0 | 15444
TextCNN | MC Heteroscedastic | 47.0 | 1525 | 38.0 | 1216
TextCNN | Concrete Dropout | 66.0 | 2130 | 40.0 | 1293
TextCNN | MC Concrete Dropout | 48.0 | 1541 | 25.0 | 827
TextCNN | Heteroscedastic Concrete Dropout | 756.0 | 24205 | 318.0 | 10197
TextCNN | MC Heteroscedastic Concrete Dropout | 48.0 | 1561 | 27.0 | 874
BERT | Unregularized | 6.0 | 223 | 0.8 | 25
BERT | Regularized | 9.0 | 306 | 0.8 | 26
BERT | MC Dropout | 0.9 | 28 | 0.1 | 2
BERT | Heteroscedastic | 10.0 | 325 | 0.8 | 26
BERT | MC Heteroscedastic | 1.0 | 31 | 0.1 | 2
BERT | Concrete Dropout | 7.0 | 245 | 0.9 | 27
BERT | MC Concrete Dropout | 1.0 | 30 | 0.1 | 2
BERT | Heteroscedastic Concrete Dropout | 6.0 | 218 | 0.9 | 27
BERT | MC Heteroscedastic Concrete Dropout | 0.9 | 30 | 0.1 | 2
C Detailed Experiment Results
C.1 Zoom-in Benchmark Evidence
In this Subsection, we report additional evidence in support of our results which did not suit the main manuscript.
C.2 Absolute Benchmark Results
Next to reporting critical differences to analyze the relative performance of uncertainty methods, we also report results as summary statistics, following the
\ No newline at end of file diff --git a/assets/txts/pg_0259.txt b/assets/txts/pg_0259.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f666d9cdde98b0029b89c19c4610ebc1b8b23c3 --- /dev/null +++ b/assets/txts/pg_0259.txt @@ -0,0 +1,22 @@
Figure A.1. Comparison with NLL(↓) for dataset-specific differences in method performance.
methodology of [462]. Firstly, we report performance averaged over both runs and datasets, with the standard deviation over datasets. We indicate the best mean performance in bold. For various metrics the standard deviation is very large, which shows that the average over datasets would be a poor measure of central tendency for our benchmark. Since we benchmark on three multi-class and two multi-label datasets, any aggregate would be biased towards multi-class performance, which is why we specifically opted for rank and critical difference to analyze the relative performance of each method.
Additionally, we compute the performance averaged over datasets, with the standard deviation over multiple runs for all individual models. All raw model results are available at https://github.com/Jordy-VL/uncertainty-bench/tree/main/experiments/raw_results. We refer to the original paper for the larger detail tables with results averaged over datasets and runs.
\ No newline at end of file diff --git a/assets/txts/pg_0260.txt b/assets/txts/pg_0260.txt new file mode 100644 index 0000000000000000000000000000000000000000..845da8b3c9fa80ab0e06afd2010d995c3304ee79 --- /dev/null +++ b/assets/txts/pg_0260.txt @@ -0,0 +1,10 @@
Figure A.2. We report the Pearson Correlation Coefficient (PCC) between uncertainty values and the binary variable ID-OOD for Amazon product review datasets. A higher absolute correlation score points to a stronger association between uncertainty and out-of-domain detection. *Model Uncertainty (MU), Data Uncertainty (DU), Mutual Information (MI).
\ No newline at end of file diff --git a/assets/txts/pg_0261.txt b/assets/txts/pg_0261.txt new file mode 100644 index 0000000000000000000000000000000000000000..6545c653c628da666d44bbd65205a2c0460ceb1d --- /dev/null +++ b/assets/txts/pg_0261.txt @@ -0,0 +1,50 @@
Figure A.3. A selection of the most interesting Gaussian kernel density plots over (abbreviated) model setup metrics evaluated on all datasets, in row order 20news (a-c), CLINC150 (d-f), imdb (g-i), Reuters (j-l), AAPD (m-o): (a) Heteroscedastic Ensemble - H; (b) MC Ensemble - S; (c) Deep Ensemble Regularized - MI; (d) Heteroscedastic CD Ensemble - S; (e) Deep Ensemble - MU; (f) MC Dropout - MU; (g) Heteroscedastic Ensemble - H; (h) Concrete Dropout - S; (i) MC Concrete Dropout - MU; (j) MC CD Ensemble - H; (k) Concrete Dropout Ensemble - S; (l) Deep Ensemble - H; (m) Deep Ensemble - MU; (n) Heteroscedastic Ensemble - MI; (o) MC Dropout - MI. Each plot captures probabilistic density over correct ID (green), incorrect ID (red) and OOD (purple). From left to right, we have selected a high-rank, middle-rank, and low-rank method and uncertainty quantity combination. The density estimates demonstrate clear empirical differences over all datasets for various uncertainty quantities.
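For completeness, the association score reported in Figure A.2 reduces to a point-biserial correlation; a minimal sketch (our analysis code may differ):

import numpy as np

def uncertainty_ood_pcc(uncertainty, is_ood):
    # Pearson correlation between continuous uncertainty values and a
    # binary ID (0) / OOD (1) indicator, i.e., a point-biserial correlation.
    u = np.asarray(uncertainty, dtype=float)
    y = np.asarray(is_ood, dtype=float)
    return np.corrcoef(u, y)[0, 1]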
\ No newline at end of file diff --git a/assets/txts/pg_0262.txt b/assets/txts/pg_0262.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a71a115fe6c9f31bce80194223630427254210c --- /dev/null +++ b/assets/txts/pg_0262.txt @@ -0,0 +1,28 @@
Appendix B
Appendix - BDPC
A Existing DC Datasets
As the datasets from Table 2 did not satisfy the requirements for large-scale multipage DC benchmarking, we discuss them in the supplementary material for interested readers.
Tobacco-3482 [232] is another subset of IIT-CDIP with fewer samples and a smaller label set than RVL-CDIP.
Tobacco-800 [553] has been used for page stream segmentation ([494], similarly defined as in [328]), as it contains consecutively numbered multipage business documents.
NIST The NIST Structured Forms Database [98] consists of 5,590 binary synthesized documents from 20 different classes of tax forms.
MARG The MARG (Medical Article Records Groundtruth) database [290] is a layout-based classification benchmark containing 1,553 documents, which are mainly the first pages of medical journals.
TAB [328] is a recently introduced page stream segmentation dataset targeting binary classification to detect document boundaries in multipage streams. It consists of a sample of 44,769 PDF documents from the Truth Tobacco Industry Documents (TTID) archives.
\ No newline at end of file diff --git a/assets/txts/pg_0263.txt b/assets/txts/pg_0263.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc19247d48c441c05c55f3b64c5aaec62c3fd2e3 --- /dev/null +++ b/assets/txts/pg_0263.txt @@ -0,0 +1,19 @@
B Visualization of Proposed DC Datasets
As we have contributed two novel datasets consisting of multipage documents in PDF format, adding visualizations is non-trivial. The datasets are hosted on the HuggingFace Hub (https://huggingface.co/datasets/bdpc), for which, at the time of submission, the dataset viewer does not support PDF data. Rather than adding examples in the manuscript, which is tedious for PDF documents with multiple pages, we have built an interactive app (https://huggingface.co/spaces/jordyvl/viz_bdpc). This allows for the visualization of samples from the proposed datasets, with an additional filter on the labels, where both datasets follow the original RVL-CDIP label taxonomy.
\ No newline at end of file diff --git a/assets/txts/pg_0264.txt b/assets/txts/pg_0264.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce6702adabaab08e00c1ce03f4625d7dd4643f92 --- /dev/null +++ b/assets/txts/pg_0264.txt @@ -0,0 +1,35 @@
Appendix C
Appendix - DUDE
A Baseline Experiments Setup
In this Section, we describe the implementation details¹ for the architectures and inference methods used in our benchmark.
A.1 Hyperparameter Defaults
Refer to Table C.1.
A.2 Generative LLM Prompt Fine-tuning
The performance of GPT3.5 models was assessed in two settings: 0-shot and 4-shot. In the 0-shot setting, the prompt included instructions similar to those provided to annotators to teach them how to annotate. In the 4-shot setting, the prompt was enhanced with the content of a single document from the training set along with four questions of different types (extractive, abstractive, list, and not answerable) to better gauge the models' abilities.
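For illustration only (the exact wording used in our experiments is not reproduced here), a 4-shot-style prompt could be structured as follows:

# Hypothetical sketch of the 4-shot prompt structure; the actual
# instructions and examples used in our experiments differed.
FOUR_SHOT_TEMPLATE = """You are given a document and must answer questions about it.
Answer 'unanswerable' when the document does not contain the answer.

Document: {example_document}
Question: {extractive_q}  Answer: {extractive_a}
Question: {abstractive_q}  Answer: {abstractive_a}
Question: {list_q}  Answer: {list_a}
Question: {unanswerable_q}  Answer: unanswerable

Document: {target_document}
Question: {target_question}  Answer:"""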
+The 0-shot prompt is analogous to the 4-shot prompt; the key distinction is
+that it lacks the first document and the example question-and-answer pairs.
+
+1 Main framework used: https://github.com/rubenpt91/MP-DocVQA-Framework
+
+232
\ No newline at end of file
diff --git a/assets/txts/pg_0265.txt b/assets/txts/pg_0265.txt
new file mode 100644
index 0000000000000000000000000000000000000000..22abbcbe88c7edbcd62d199740d98b944b61ab98
--- /dev/null
+++ b/assets/txts/pg_0265.txt
@@ -0,0 +1,111 @@
+BASELINE EXPERIMENTS SETUP 233
+
+Hyper-Parameter             T5               T5+2D        HiVT5
+Epochs                      10               10           10
+Warm-up (iterations)        1000             250          1000
+Optimizer                   Adam, AdamW      Adafactor    Adam
+Gradient acc.               False            8            False
+Lower case                  True             True         True
+Max. Seq. Length            512, 8192        512, 8192    20480
+Generation (Max. Tokens)    100              100          50
+Batch size                  3                8            1
+Learning rate               1E-04, 2E-04     2E-04        2E-04
+Training time (per epoch)   1h, 10h          1.5h, 5h     10h
+GPU Hardware                TITAN RTX, A100  A100 (80GB)  TITAN RTX (24GB)
+
+Table C.1. Hyperparameters used for fine-tuning T5, T5+2D and HiVT5 on DUDE.
+When two values are placed in a single column, they refer to the model's versions with
+512 and 8192 input sequence length, respectively.
+
+For the inference process, we utilized the default prompt completion settings
+outlined in the OpenAI documentation, with the exception of the temperature
+parameter, which was lowered to 0.0. This adjustment was made to ensure that
+the output would be more deterministic and focused, with less emphasis on
+generating creative variations.
+Only after our prompting experiments had been completed did we realize the
+opportunity to assess confidence estimation using chained prompts (Please give
+a confidence between 0 and 1 about how certain you are this is the answer.), as
+in [219]. Since we did not save our dialogue states, and considering the expense,
+we leave this for future work.
+
+A.3  Confidence Estimation
+
+This subsection details the confidence scoring functions for the baselines, as
+these are not reported in standard practice.
+We define confidence as the predicted probability of the top-1 prediction, often
+arising as the largest value from softmax normalization of the logits from a final
+model layer (head).
\ No newline at end of file
diff --git a/assets/txts/pg_0266.txt b/assets/txts/pg_0266.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c57c8e0899bf055b024481f2ddfee9e6b3b27a
--- /dev/null
+++ b/assets/txts/pg_0266.txt
@@ -0,0 +1,61 @@
+234 APPENDIX - DUDE
+
+Encoder-based models output logits for all possible start and end positions
+of the answer within the provided context.
While the predicted answer of
+such a span prediction architecture comes from the highest-scoring valid
+(non-negative span) combination of a start and end logit, the predicted answer
+confidence can be obtained by the following procedure (BS: batch size, S:
+sequence length):
+
+import numpy as np
+import torch
+
+# Standard span prediction forward call
+outputs = model(**inputs, start_positions=start_positions,
+                end_positions=end_positions)
+
+# Assumes masking all padding and special tokens after softmax with 0
+start = outputs.start_logits.softmax(dim=1).unsqueeze(0).unsqueeze(-1)  # 1 x BS x S x 1
+end = outputs.end_logits.softmax(dim=1).unsqueeze(0).unsqueeze(1)       # 1 x BS x 1 x S
+
+# Probability of each valid (end >= start) start/end pair;
+# triu() zeroes out candidates where the end precedes the start
+candidate_matrix = torch.matmul(start, end).triu().detach().numpy()  # 1 x BS x S x S
+
+# Obtain the highest scoring candidate span per sample by multi-index argmax
+flat_probs = candidate_matrix[0].reshape(candidate_matrix.shape[1], -1)  # BS x S*S
+best = flat_probs.argmax(axis=1)
+start_idx, end_idx = np.unravel_index(best, candidate_matrix.shape[2:])
+batch_answer_confs = flat_probs[np.arange(flat_probs.shape[0]), best]
+
+Decoder-based models are not restricted to spans and can output an arbitrary,
+though often controllable, number of text tokens, denoted S'. The logits at the
+final layer take the shape BS x S' x V, where V is the tokenizer's vocabulary
+size (32.1K for T5-base). The following confidence estimation procedure is
+applied to decoder outputs:
+
+# Standard decoder-based greedy forward pass (without labels)
+outputs = model.generate(**input_ids, output_scores=True,
+                         return_dict_in_generate=True)
+
+# BS x S' x V; drop the EOS token, then softmax + max per generated token
+batch_logits = torch.stack(outputs.scores, dim=1)[:, :-1, :]
+decoder_outputs_confs = torch.amax(batch_logits.softmax(-1), 2)
+# Remove padding from batching decoder outputs of variable sizes
+decoder_outputs_confs_masked = torch.where(
+    outputs.sequences[:, 1:-1] > 0,
+    decoder_outputs_confs,
+    torch.ones_like(decoder_outputs_confs))
+# Multiply probabilities over tokens to score the full sequence
+batch_answer_confs = decoder_outputs_confs_masked.prod(1)
\ No newline at end of file
diff --git a/assets/txts/pg_0267.txt b/assets/txts/pg_0267.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bb1a96adf3746f044adc6a7f8330fe72c7553cb9
--- /dev/null
+++ b/assets/txts/pg_0267.txt
@@ -0,0 +1,24 @@
+QUALITATIVE EXAMPLES 235
+
+A.4  Evaluation
+
+All metric implementations (ANLS, ECE, AURC) are made available as
+a standalone repository. Additionally, we provide an online service where
+researchers can evaluate their methods against a blind (questions-only) test
+dataset. General metric descriptions are provided in Section 2.2.3, with additional
+implementation details and motivated design choices. While ANLS can account
+for shortcomings of OCR and formatting issues, evaluation of generated text is
+notoriously complex [377] and requires more research.
+
+B  Qualitative Examples
+
+As is customary, we provide some interesting, handpicked test set examples
+with predictions from some of the baselines in our study.
\ No newline at end of file
diff --git a/assets/txts/pg_0268.txt b/assets/txts/pg_0268.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95417ae1da4d55ceaa154043fd8fffc8c3aa6b72
--- /dev/null
+++ b/assets/txts/pg_0268.txt
@@ -0,0 +1,62 @@
+236 APPENDIX - DUDE
+
+Low complexity.
Where the document has been printed? +Simple, extractive question, plain-text evidence. + +Source + +Answer + +Ground truth +Human + +New Delhi, India +India + +T5 +ChatGPT + +IS : 9304 - 1979 +The document does +not mention where +it has been printed. +Bela Pack n Print. +New Delhi, India +New Delhi, India +Page 1 +new delhi, india + +GPT3 +T5-2D +HiVT5 +Longformer + +ANLS + +Conf. + +0.0 + +— + +0.0 +0.0 + +0.56 +— + +0.0 + +— + +1.0 +0.0 +1.0 + +0.09 +0.18 +0.72 + + \ No newline at end of file diff --git a/assets/txts/pg_0269.txt b/assets/txts/pg_0269.txt new file mode 100644 index 0000000000000000000000000000000000000000..012c1812417e2eca95882487d4cbbb8b423dd96b --- /dev/null +++ b/assets/txts/pg_0269.txt @@ -0,0 +1,108 @@ +QUALITATIVE EXAMPLES + +237 + +High complexity. Is there any redacted section on the document? +Abstractive question that requires knowledge about possible document elements. + +Source + +Answer + +Ground truth +Human + +No +No + +ANLS + +Conf. + +1.0 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 +LayoutLMv3 + +yes +[Not-answerable] +[Not-answerable] +No +Yes +approved for release + +0.0 +0.0 +0.0 +1.0 +0.0 +0.0 + +0.17 +— +— +0.43 +0.55 +0.01 + +Requires arithmetic. What is the difference between how much Operator II +and Operator III makes per hour? +The question requires table comprehension, determining relevant values, dividing +extracted integers, and correcting the subject-verb agreement. + +Source + +Answer + +Ground truth +Human + +$5 +$5 + +T5 +ChatGPT +GPT3 + +200 +$5 per hour. +Operator II ($17/hr) +| +Operator +III +($22/hr) +[Not-answerable] +[Not-answerable] + +T5-2D +HiVT5 + +ANLS + +Conf. + +1.0 + +— + +0.0 +0.0 +0.0 + +0.28 +— +— + +0.0 +0.0 + +0.31 +0.15 + + \ No newline at end of file diff --git a/assets/txts/pg_0270.txt b/assets/txts/pg_0270.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce0a31bd405cbab0981b15cc6ee12a4653fc5216 --- /dev/null +++ b/assets/txts/pg_0270.txt @@ -0,0 +1,62 @@ +238 + +APPENDIX - DUDE + +Visual evidence (chart). What is the maximum percentage of the blue graph +line on page 8? +A highly demanding question that requires simultaneous competency of visual +comprehension (locating chart and line color), navigating through layout +(determining adequate page), and numerical comparison (deciding on the highest +value). + +Source + +Answer + +Ground truth +Human + +75% +75 + +ANLS + +Conf. + +0.7 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 +BigBird +LayoutLMv3 + +76 +[Not-answerable] +76% +32.0 +45% +32 +80% + +0.0 +0.0 +0.7 +0.0 +0.7 +0.0 +0.0 + +0.25 +— +— +0.00 +0.05 +0.47 +0.15 + + \ No newline at end of file diff --git a/assets/txts/pg_0271.txt b/assets/txts/pg_0271.txt new file mode 100644 index 0000000000000000000000000000000000000000..1367c584c4473e779406cc759a5b40f3c63cf680 --- /dev/null +++ b/assets/txts/pg_0271.txt @@ -0,0 +1,58 @@ +QUALITATIVE EXAMPLES + +239 + +Visual evidence (handwriting). + +What is the handwritten date on page 1? + +The question requires visual comprehension (recognition of handwriting) and +layout navigation (determining the adequate page). + +Source + +Answer + +Ground truth +Human + +13-XII-50 +13-XII-50 + +ANLS + +Conf. 
+ +1.0 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 +BERTQA + +1977-01-01 +[Not-answerable] +15 December 1950 +1950-12-15 +1977-07-01 +2006 / 1 + +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 + +0.24 +— +— +0.24 +0.11 +0.5 + + \ No newline at end of file diff --git a/assets/txts/pg_0272.txt b/assets/txts/pg_0272.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d4c3aad835749104d520a8b87b3780ebd9a7a8e --- /dev/null +++ b/assets/txts/pg_0272.txt @@ -0,0 +1,51 @@ +240 + +APPENDIX - DUDE + +Requires counting. How many pages have a signature? +The question requires visual comprehension (recognition of signature), knowledge +about layout, and counting. + +Source + +Answer + +ANLS + +Conf. + +Ground truth +Human + +2 +2 + +1.0 + +— + +T5 +ChatGPT +GPT3 +T5-2D +HiVT5 + +1 +4 +[Not-answerable] +4 +4 + +0.0 +0.0 +0.0 +0.0 +0.0 + +0.01 +— +— +0.69 +0.41 + + \ No newline at end of file diff --git a/assets/txts/pg_0273.txt b/assets/txts/pg_0273.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa0d6fadfe95953c293fa7962933003b47a21708 --- /dev/null +++ b/assets/txts/pg_0273.txt @@ -0,0 +1,65 @@ +QUALITATIVE EXAMPLES + +241 + +Visual evidence (map), multi-hop. Which states don’t have any marijuana +laws? +The multi-hop question requires visually comprehending the map and linking +knowledge from its legend with depicted regions. + +Source + +Answer + +Ground truth +Human + +ID | SD | KS +ID | SD | KS + +T5 + +WA ME MT ND +MN OR VT ID NH +SD WI NY MA MI +[Not-answerable] +American Samoa +i +- + +ChatGPT +GPT3 +T5-2D +HiVT5 + +B.1 + +ANLS + +Conf. + +1.0 + +— + +0.0 + +0.28 + +0.0 +0.0 +0.0 +0.0 + +——0.03 +0.02 + +Qualitative Examples - Competition + +We provide some interesting, hand-picked test set examples with predictions +from the submitted competition methods. +Low complexity. Who is the president and vice-chancellor? Despite +the question’s relatively straightforward nature, some systems struggle with +providing the appropriate answer. One can hypothesize it is the result of limited + + \ No newline at end of file diff --git a/assets/txts/pg_0274.txt b/assets/txts/pg_0274.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9bdcc43c6228d5240e39e1ac8e59367a82b43ee --- /dev/null +++ b/assets/txts/pg_0274.txt @@ -0,0 +1,135 @@ +242 + +APPENDIX - DUDE + +context (the answer is located at the end of the document), i.e., models either +hallucinate a value or provide a name found earlier within the document. +Source + +Answer + +Ground truth +Human + +Jack N. Lightstone +Jack N. Lightstone + +ANLS + +Conf. + +1.0 + +— + +T5-base +MMT5 +UDOP+BLIP2+GPT +HiVT5+modules + +James L. Turk +james l. turk +jack n. lightstone +Jack N. Whiteside + +0.0 +0.0 +1.0 +0.6 + +0.0 +1.0 +0.9 +0.6 + +Requires graphical comprehension. Which is the basis for jurisdiction? +To provide a valid answer, the model needs to comprehend the meaning of +the form field and recognize the selected checkbox. None of the participating +systems was able to spot the answer correctly. + +Source + +Answer + +Ground truth + +U.S. Goverment Plaintiff +U.S. Goverment Plaintiff + +Human +T5-base +MMT5 + +Declaration of taking +united states district +court + +HiVT5+modules +UDOP+BLIP2+GPT public purpose + +ANLS + +Conf. + +1.0 + +— + +0.0 +0.0 + +0.1 +1.0 + +0.0 +0.0 + +1.0 +0.4 + +Requires comparison. In which year does the Net Requirement exceed +25,000? The question requires comprehending a multipage table and spotting if +any values fulfill the posed condition. 
Some of the models resort to plausible +answers (one of the three dates that the document covers), whereas others +correctly decide there is no value exceeding the provided amount. +Source + +Answer + +Ground truth +Human + +[Unanswerable] +[Unanswerable] + +ANLS + +Conf. + +1.0 + +— + +T5-base +MMT5 +UDOP+BLIP2+GPT +HiVT5+modules + +[Unanswerable] +2018 +[Unanswerable] +2017 + +1.0 +0.0 +1.0 +0.0 + +0.2 +1.0 +1.0 +0.8 + +Requires arithmetic. What is the difference between how much Operator II + + \ No newline at end of file diff --git a/assets/txts/pg_0275.txt b/assets/txts/pg_0275.txt new file mode 100644 index 0000000000000000000000000000000000000000..46773fdaa7b8796d44596d3b4448015ae817d09d --- /dev/null +++ b/assets/txts/pg_0275.txt @@ -0,0 +1,92 @@ +QUALITATIVE EXAMPLES + +243 + +and Operator III make per hour? The question requires table comprehension, +determining relevant values, and dividing extracted integers. None of the +participating models was able to fulfill this requirement. +Source + +Answer + +Ground truth +Human + +$5 +$5 + +ANLS + +Conf. + +1.0 + +— + +T5-base +MMT5 +UDOP+BLIP2+GPT +HiVT5+modules + +$0.00 +65% +-1.5 mile +$5,700.00 + +0.0 +0.0 +0.0 +0.0 + +0.0 +1.0 +0.0 +0.4 + +Requires counting and list output. What are the first two behavioral and +intellectual disabilities of people with FASDs? It seems most of the models +correctly recognized that this type of question requires a list answer but either +failed to comprehend the question or provided a list with incorrect length +(incomplete or with too many values). +Source + +Answer + +Ground truth +Human + +Learning disabilities | Hyperactivity +learning disabilities + +T5-base +MMT5 + +Early embryo brain development | External Genitals +heart beats | difficulty with attention | lung function +| hyperactivity | problem with judgment | speech and +language delays +UDOP+BLIP2+GPT hyperactivity | speech and language delays +HiVT5+modules +HIV/AIDS + +ANLS + +Conf. + +0.5 + +— + +0.0 +0.2 + +0.0 +1.0 + +0.5 +0.0 + +0.2 +0.6 + + \ No newline at end of file diff --git a/assets/txts/pg_0276.txt b/assets/txts/pg_0276.txt new file mode 100644 index 0000000000000000000000000000000000000000..18669c134f8f4a73b51b4cc4cdd5a1ce937fb46e --- /dev/null +++ b/assets/txts/pg_0276.txt @@ -0,0 +1,33 @@ +Appendix D + +Appendix - KDD +A + +Code and Datasets + +The proposed KD-VDU experimentation framework is available as linked +in the main manuscript. This includes the DIC benchmarking that is made +fully compatible with HuggingFace transformers, even allowing arbitrary image +classification models and (document) image datasets from HuggingFace hub. +The DLA benchmark is built around the Detectron2 framework, with +additional scripts for efficiency evaluation, visualization, and document data +preparation for downstream tasks. Downstream task experiments are made +available as a fork of the original LATIN-prompt [482] implementations with +additional modifications (4-bit quantization, question type ANLS evaluation, +InfographicsVQA dataloader, structure-preserving OCR respecting DLA +tokens). + +B + +Implementation Details + +DIC All runs are documented with hyperparameter configuration and +commandline arguments in a wandb project for complete transparency in +experiment results and reproducibility. +For RVL-CDIP, both teacher and student training is carried out for 10 epochs +with a batch size of (32 ViT, 64 ResNet) and AdamW with weight decay 5e-4 +and a learning rate of 1e-4 with a linear warmup of 10%. 
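+As a rough sketch, this recipe corresponds to a HuggingFace TrainingArguments
+configuration along these lines (values restate the text above; the output
+directory name is arbitrary):
+
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="rvl_cdip_distill",     # arbitrary name
+    num_train_epochs=10,
+    per_device_train_batch_size=32,    # 32 for ViT; 64 for ResNet students
+    learning_rate=1e-4,
+    weight_decay=5e-4,
+    warmup_ratio=0.1,                  # linear warmup over 10% of steps
+    lr_scheduler_type="linear",
+    optim="adamw_torch",
+)
+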
For Tobacco-3482,
+the default recipe is similarly trained for 100 epochs. All experiments were
+
+244
\ No newline at end of file
diff --git a/assets/txts/pg_0277.txt b/assets/txts/pg_0277.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fa6479428b805acf2c46d22c97e6748892039393
--- /dev/null
+++ b/assets/txts/pg_0277.txt
@@ -0,0 +1,88 @@
+IMPLEMENTATION DETAILS 245
+
+performed on a single NVIDIA GeForce RTX 3090 GPU (24GB vRAM).
+For some feature-based KD methods, the batch size had to be lowered to 16
+due to memory constraints. KD method hyperparameters were cross-validated
+to find the best performing configuration for each method, and are listed in the
+main manuscript result tables.
+DLA In this paper, the Mask R-CNN detection architecture is considered with
+two different backbone families: (1) CNNs: ResNet-50 and ResNet-101; (2)
+Transformers: ViT-Base and ViT-Tiny. All detection models are trained with
+Detectron2 [499], which uses the PyTorch deep learning library. The
+hyperparameters used are the following: (a) learning rate: 1e-4; (b) iterations:
+300k; (c) optimizer: Adam; (d) batch size: 16; (e) ROI head predictions: 128;
+(f) NMS threshold: 0.4; (g) confidence threshold: 0.6. For reproducibility, we
+share the exact config files used for each experiment as part of the Supplementary.
+Teacher and student model variants Tables D.1 and D.2 indicate the
+differences between the used teacher and student models in terms of
+parameterization and efficiency.
+
+Table D.1. Details of Vision Transformer model variants [101].
+
+Variant     Layers  Width  FFN   Heads  #Param
+Tiny (T)    12      192    768   3      5.5M
+Small (S)   12      384    1536  6      21.7M
+Base (B)    12      768    3072  12     85.8M
+
+Table D.2. Details of the efficiency of model checkpoints considered in this work.
+
+Model                            GFLOPs  GMACs  Params (M)
+microsoft/resnet-101             15.65   7.80   42.50
+microsoft/resnet-50              8.21    4.09   23.51
+google/vit-base-patch16-224      35.15   17.56  86.39
+microsoft/dit-base               35.15   17.56  85.81
+WinKawaks/vit-small-patch16-224  9.21    4.60   21.81
+WinKawaks/vit-tiny-patch16-224   2.51    1.25   5.56
+
+Downstream We extended the implementation of [482] to incorporate Llama-2
+[452] and built a similar dataloader for InfographicsVQA [310]. To enable strict
+compatibility, we used the same unified OCR format, DUE [47], for all datasets.
+This facilitated easy incorporation of DLA tokens into the OCR tokens without
\ No newline at end of file
diff --git a/assets/txts/pg_0278.txt b/assets/txts/pg_0278.txt
new file mode 100644
index 0000000000000000000000000000000000000000..87092c6b36f2544c99b9a21e07a40d75d44a07df
--- /dev/null
+++ b/assets/txts/pg_0278.txt
@@ -0,0 +1,44 @@
+246 APPENDIX - KDD
+
+disrupting the logic behind the original layout-aware representation of document
+text. As it involved zero-shot evaluation, no finetuning was attempted for this
+task; while finetuning could be left for future work, we want to reiterate that we
+sought to explore the innate ability of LLMs to ingest DLA-enriched prompts,
+and not the downstream task performance itself.
+
+C  Task Definitions
+
+The definitions have been incorporated as part of the fundamentals. Here we
+will only point to details that are not included in the main manuscript.
+To place each task in the context of document inputs, we define the following
+tasks and their respective inputs with common notation. We follow the notation
+established in [470] for document page inputs.
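+
+Before the formal notation below, a minimal container for such page inputs could
+look as follows (a sketch with our own naming, not code from [470]):
+
+from dataclasses import dataclass
+from typing import List, Tuple
+import torch
+
+@dataclass
+class PageInput:
+    image: torch.Tensor                     # v, shape (C, H, W)
+    tokens: List[str]                       # u = (w_1, ..., w_T), OCR or born-digital text
+    boxes: List[Tuple[int, int, int, int]]  # s, one (x1, y1, x2, y2) box per token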
+A page p consists of an image v ∈ R^(C×H×W) (number of channels, height,
+and width, respectively) with T word tokens u = {w_t}_{t=1}^T, organized
+according to a layout structure s = {(x_t^1, y_t^1, x_t^2, y_t^2)}_{t=1}^T,
+typically referred to as token bounding boxes, coming from OCR or available
+from a born-digital document.
+DIC As a prototypical instance of classification [472], the goal is to learn an
+estimator f : X → Y using N supervised input-output pairs (X, Y) ∈ X × Y
+drawn i.i.d. from an unknown joint distribution P(X, Y). In the context of
+DIC, the input space X is the set of all document images, and the output space
+Y is the set of all document classes (e.g., invoice, email, form, advertisement,
+etc.). The goal is to learn a function f that maps a document image x ∈ X to
+a document class y ∈ Y, such that f(x) = y. Covariate shift [418] occurs when
+the input distribution P(X) changes between the training and evaluation sets,
+but the conditional distribution P(Y|X) remains the same. Put plainly, both
+sets share the same document classes, yet the visual appearance, layout and
+content of the document images can differ. For example, RVL-CDIP-N contains
+more modern documents with color, whereas all RVL-CDIP [241] documents
+are greyscale.
+DLA The task of DLA can be formulated as a function that processes a
+document image input and outputs structured information about its logical
+layout elements (e.g., text blocks, headers, figures, charts, plots, tables). Let
+DLA(x) represent the output predictions of the DLA process as a set of tuples,
\ No newline at end of file
diff --git a/assets/txts/pg_0279.txt b/assets/txts/pg_0279.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1523019ef36b4ee99c269c699be759260653b535
--- /dev/null
+++ b/assets/txts/pg_0279.txt
@@ -0,0 +1,98 @@
+ADDITIONAL EXPERIMENT RESULTS 247
+
+where each tuple (b_j, c_j, m_j) represents one of J detected logical layout
+elements:
+
+DLA(x) = {(b_j, c_j, m_j)}_{j=1}^J    (D.1)
+
+For each j, b_j denotes the bounding box of the j-th detected element, defined
+as (x_j, y_j, w_j, h_j) (in the popular COCO format); c_j is the class label of
+the j-th element, indicating its object category; and m_j is a set of additional
+properties or information (metadata attributes, predicted scores; considered
+optional) associated with the j-th element, which can vary depending on the
+type and context of the layout components.
+Zero-shot Document Visual Question Answering Given a document d and
+a question q, the goal of zero-shot DocVQA is to predict the answer a to the
+question q from the document, assuming a single document image for simplicity.
+Following the text-only LLM approach in [482], each document image needs to
+be translated to text, either from OCR or from a born-digital document, and
+the question is translated to a prompt p. The prompt p is a sequence of tokens
+that is fed to the LLM, together with a potential task instruction, and the
+document image text D, which is structured following a heuristic procedure
+operating on the text tokens (T) and respective bounding boxes (see Table 6.2).
+
+D  Additional Experiment Results
+
+Table D.3. Results of different KD strategies benchmarked for ResNets applied on the
+RVL-CDIP dataset.
+Dataset +RVL-CDIP +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k +RVL-CDIP1k + +Teacher +ResNet-101 +– +ResNet-101 +ResNet-101 +ResNet-101 +ResNet-101 +ResNet-101 +ResNet-101 + +Student +– +ResNet-50 +ResNet-50 + +Method +Baseline +Baseline +Vanilla [τ = 2.5, α = 0.5] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [∅ projector] +SimKD [CNN] +FitNet [middle] + +ACC +0.819 +0.783 +0.783 +0.785 +0.786 +0.769 +0.797 +0.758 + +AURC +0.043 +0.059 +0.059 +0.063 +0.058 +0.067 +0.053 +0.087 + +ECE +0.017 +0.039 +0.039 +0.073 +0.032 +0.025 +0.023 +0.178 + + \ No newline at end of file diff --git a/assets/txts/pg_0280.txt b/assets/txts/pg_0280.txt new file mode 100644 index 0000000000000000000000000000000000000000..699e666f5ab8f086fd4885d668f634ee11f4122d --- /dev/null +++ b/assets/txts/pg_0280.txt @@ -0,0 +1,116 @@ +248 + +APPENDIX - KDD + +Table D.4. Results of different KD strategies benchmarked for ResNets applied on the +Tobacco-3482 dataset. + +Student +– +ResNet-50 + +Method +Teacher +CE +CE+KD +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet + +ACC +0.445 +0.552 +0.667 +0.436 +0.399 +0.176 +0.314 +0.577 + +ECE +0.102 +0.096 +0.127 +0.076 +0.083 +0.250 +0.103 +0.085 + +AURC +0.360 +0.256 +0.149 +0.330 +0.379 +0.768 +0.429 +0.219 + +Table D.5. Results of different KD strategies benchmarked for ViT-B applied on the +Tobacco-3482 datasets. + +Student +ViT-S + +ViT-T + +Method +Teacher +CE +CE+KD +NKD +MSE +SimKD [CNN] +FitNet +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet + +ACC +0.876 +0.783 +0.814 +0.803 +0.807 +0.836 +0.821 +0.792 +0.798 +0.811 +0.810 +0.805 + +ECE +0.082 +0.096 +0.072 +0.094 +0.161 +0.125 +0.151 +0.064 +0.198 +0.599 +0.135 +0.160 + +AURC +0.040 +0.071 +0.063 +0.066 +0.062 +0.072 +0.059 +0.069 +0.074 +0.065 +0.081 +0.070 + + \ No newline at end of file diff --git a/assets/txts/pg_0281.txt b/assets/txts/pg_0281.txt new file mode 100644 index 0000000000000000000000000000000000000000..c041f39cf9c321030da2b3eb2103d317d2257ca6 --- /dev/null +++ b/assets/txts/pg_0281.txt @@ -0,0 +1,149 @@ +ADDITIONAL EXPERIMENT RESULTS + +249 + +Table D.6. Results of different KD strategies benchmarked for DiT-B applied on the +Tobacco-3482 dataset. + +Student +ViT-S + +ViT-T + +Method +Teacher +CE +CE+KD +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet +CE +CE+KD +NKD +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet + +ACC +0.916 +0.820 +0.825 +0.813 +0.818 +0.829 +0.810 +0.827 +0.810 +0.816 +0.807 +0.811 +0.778 +0.783 +0.793 + +ECE +0.109 +0.081 +0.086 +0.101 +0.090 +0.153 +0.144 +0.152 +0.066 +0.078 +0.087 +0.072 +0.162 +0.187 +0.168 + +AURC +0.020 +0.059 +0.064 +0.055 +0.063 +0.056 +0.062 +0.067 +0.065 +0.065 +0.063 +0.061 +0.093 +0.079 +0.077 + +Table D.7. Results for DLA-KD experiments on PRImA dataset. 
+
+Teacher      Student     Method     mAP
+ViT-B        –           Teacher    36.01
+ResNet-101   –           Teacher    38.34
+–            ViT-T       Baseline   32.64
+–            ResNet-50   Baseline   35.61
+ResNet-101   ResNet-50   SimKD      35.00
+ResNet-101   ResNet-50   ReviewKD   34.31
+ViT-B        ViT-T       SimKD      32.05
+ViT-B        ViT-T       ReviewKD   31.94
+
+D.1  Tobacco-3482 Results
+
+D.2  PRImA Results
+
+D.3  RVL-CDIP-N Results
+
+D.4  Downstream DocVQA Results
+
+D.5  Ablation Experiments
+
+The experiments with random student weight initialization (Tables D.12
+and D.13) show that ViTs suffer more from student weight initialization, which is
\ No newline at end of file
diff --git a/assets/txts/pg_0282.txt b/assets/txts/pg_0282.txt
new file mode 100644
index 0000000000000000000000000000000000000000..68df93f8252c502bffc8e41c9667118846f2dfd0
--- /dev/null
+++ b/assets/txts/pg_0282.txt
@@ -0,0 +1,61 @@
+250 APPENDIX - KDD
+
+evidenced by an average accuracy of 0.5962 for ViT-S/T_rand compared to 0.7675
+for R50_rand. When the student initialization does not depend on pretraining,
+NKD emerges as a performant method, showing the versatility of response-based
+methods when the transfer of feature representations is harder.
+
+Table D.8. Evaluation including relative runtime of KD methods on RVL-CDIP-N,
+where from left-to-right results are grouped per KD strategy, per backbone, per student
+size.
+
+Table D.9. Results for KD methods when averaged over architectures and student
+sizes on RVL-CDIP-N.
+
+KD method          ACC    ECE    AURC
+Teacher            0.611  0.120  0.152
+CE                 0.573  0.119  0.215
+CE+KD              0.519  0.184  0.298
+NKD                0.524  0.137  0.259
+MSE                0.490  0.205  0.308
+SimKD [CLS+MLP]    0.613  0.202  0.216
+SimKD [CNN]        0.629  0.273  0.197
+FitNet             0.534  0.281  0.246
+prompt +plain +space +task ++DLA + +task_space ++DLA + +DLA + +ANLS Image/Photo Yes/No Figure/diagram Form Free_text Handwritten Layout Others Table/list + +Resnet-101 +Resnet-101 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD +Resnet-101 +Resnet-50 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD + +4.3 +4.61 +57.63 +57.76 +57.55 +57.76 +57.53 +58.39 +58.65 +57.96 +58.58 +62.46 +61.86 +62.08 +62.14 +61.95 +61.2 +58.65 +61.58 +61.46 + +4.25 +2.97 +45.38 +43.31 +44.44 +43.31 +45.45 +44.43 +44.7 +45.9 +45.09 +42.95 +41.51 +39.62 +44.09 +43.93 +44.58 +44.7 +46.25 +44.79 + +5.36 +0.0 +51.52 +47.02 +49.4 +47.02 +51.52 +41.67 +50.3 +47.32 +49.43 +49.43 +48.24 +49.13 +42.26 +44.97 +49.13 +50.3 +46.75 +48.24 + +1.46 +1.25 +34.97 +35.01 +34.0 +35.01 +35.28 +34.81 +36.19 +33.49 +34.92 +40.93 +40.63 +42.4 +40.39 +40.57 +40.28 +36.19 +37.84 +40.25 + +2.69 +3.31 +67.88 +66.84 +66.99 +66.84 +67.39 +66.38 +67.65 +66.68 +67.28 +71.15 +71.12 +71.27 +70.6 +71.02 +68.95 +67.65 +69.37 +69.55 + +8.99 +7.55 +69.71 +70.03 +68.64 +70.03 +68.73 +67.82 +68.0 +68.92 +70.64 +70.59 +69.39 +70.37 +69.69 +70.12 +68.39 +68.0 +69.27 +69.95 + +1.74 +2.14 +53.19 +52.27 +51.97 +52.27 +52.23 +52.1 +52.49 +51.15 +52.19 +55.87 +54.56 +54.43 +53.07 +54.95 +52.81 +52.49 +53.86 +53.15 + +6.1 +6.48 +55.51 +57.16 +56.52 +57.16 +56.71 +59.19 +59.29 +58.46 +58.44 +61.87 +61.38 +61.54 +61.8 +61.43 +61.38 +59.29 +61.5 +61.0 + +7.72 +8.45 +55.78 +58.77 +58.23 +58.77 +56.5 +55.91 +57.03 +56.32 +57.68 +61.05 +58.62 +59.86 +60.14 +60.74 +56.44 +57.03 +58.44 +58.18 + +1.87 +2.59 +53.81 +52.22 +52.64 +52.22 +52.2 +52.79 +52.72 +51.89 +52.82 +58.31 +57.48 +57.59 +58.29 +57.69 +56.7 +52.72 +57.63 +57.05 + +Table D.11. Validation ANLS (scaled to %) of Llama-2-7b-chat [452] on +InfographicsVQA [310], with a KD-DLA model enriching the prompt. 
+prompt +plain +space +task ++DLA + +task+space ++DLA + +DLA + +Resnet-50 +Resnet-101 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD +Resnet-50 +Resnet-101 +Resnet-50 ReviewKD +Resnet-50 SimKD +Vit-B +Vit-T +Vit-T ReviewKD +Vit-T SimKD + +ANLS Arithmetic Comparison Counting Figure Map Multi-span Non-extractive Question span Single span Table/list Text Visual/layout +0.81 +0.69 +29.08 +27.94 +27.86 +28.16 +27.65 +28.36 +28.32 +28.23 +28.18 +27.97 +27.14 +28.08 +28.07 +27.68 +28.05 +27.0 +28.47 +27.97 + +0.0 +0.0 +14.15 + +0.0 +0.0 +26.94 + +0.23 +0.0 +11.35 + +0.42 0.0 +0.32 0.0 +27.52 19.1 + +0.93 +0.9 +19.79 + +0.12 +0.0 +12.79 + +0.64 +0.53 +48.44 + +0.98 +0.86 +33.79 + +1.0 +1.08 +26.17 + +1.93 +1.55 +35.24 + +0.47 +0.0 +26.39 + +14.1 +12.12 +13.33 +13.79 +14.93 +15.06 + +26.21 +24.96 +25.81 +25.78 +29.15 +28.02 + +10.28 +11.35 +12.05 +9.95 +7.64 +9.58 + +26.19 +26.32 +26.39 +26.16 +27.05 +27.25 + +20.25 +18.82 +22.11 +19.53 +19.0 +19.01 + +17.7 +18.32 +21.06 +18.78 +19.41 +17.0 + +12.28 +11.93 +12.93 +11.97 +11.21 +11.82 + +45.14 +44.81 +46.95 +45.95 +46.87 +45.67 + +32.7 +32.62 +32.42 +32.17 +33.35 +33.48 + +24.79 +24.51 +25.02 +24.31 +25.56 +25.02 + +34.3 +33.89 +34.18 +33.8 +34.59 +34.81 + +26.96 +25.94 +26.86 +26.31 +26.69 +28.33 + +13.35 +14.82 +9.78 +8.12 +9.49 +9.59 + +27.7 +26.31 +25.13 +23.78 +24.31 +24.18 + +10.78 +9.6 +6.99 +6.27 +8.04 +8.41 + +26.39 +26.19 +25.93 +24.68 +25.88 +25.88 + +20.03 +18.96 +21.04 +18.67 +19.72 +18.67 + +20.4 +18.09 +22.33 +19.26 +21.01 +21.37 + +11.92 +12.51 +8.2 +7.0 +8.63 +9.01 + +45.95 +45.36 +43.36 +41.95 +41.23 +42.86 + +32.95 +32.87 +33.53 +33.03 +33.77 +33.53 + +25.9 +24.93 +25.76 +25.93 +25.87 +26.2 + +35.28 +34.71 +35.06 +34.07 +35.24 +35.49 + +27.46 +30.98 +27.47 +28.48 +28.44 +27.8 + +9.98 +9.92 +9.06 +10.89 +10.56 + +24.45 +25.28 +23.19 +25.9 +25.54 + +7.11 +7.83 +7.34 +5.42 +8.35 + +25.71 +26.28 +25.81 +26.8 +26.23 + +20.65 +19.0 +21.9 +22.23 +20.65 + +20.87 +21.85 +18.9 +20.59 +20.34 + +8.4 +8.82 +8.04 +8.28 +9.19 + +43.36 +41.84 +39.82 +45.67 +44.08 + +33.19 +33.54 +32.65 +34.24 +33.43 + +25.51 +25.57 +23.69 +26.44 +25.04 + +34.56 +34.6 +33.93 +35.81 +33.89 + +27.81 +29.17 +28.33 +29.14 +30.49 + + \ No newline at end of file diff --git a/assets/txts/pg_0284.txt b/assets/txts/pg_0284.txt new file mode 100644 index 0000000000000000000000000000000000000000..63241834db57d93325e6d5d86d6c4a710c6a6be4 --- /dev/null +++ b/assets/txts/pg_0284.txt @@ -0,0 +1,148 @@ +252 + +APPENDIX - KDD + +Table D.12. Results of different KD strategies benchmarked for ViT-B teacher with +randomly initialized (rand) ViT students applied on the RVL-CDIP dataset. + +Teacher +ViT-B_rand +– +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B +ViT-B + +Student +– +ViT-Srand + +ViT-Trand + +Method +Baseline +Vanilla [τ = 2.5, α = 0.5] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet [middle] +Vanilla [τ = 2.5, α =] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [CLS+MLP] +SimKD [CNN] +FitNet [middle] + +ACC +0.540 +0.613 +0.579 +0.626 +0.609 +0.681 +0.628 +0.560 +0.552 +0.579 +0.582 +0.663 +0.570 + +AURC +0.235 +0.175 +0.193 +0.159 +0.181 +0.181 +0.161 +0.212 +0.215 +0.198 +0.199 +0.205 +0.207 + +ECE +0.078 +0.220 +0.046 +0.203 +0.120 +0.297 +0.155 +0.141 +0.025 +0.232 +0.196 +0.316 +0.143 + +Table D.13. Results of different KD strategies benchmarked for ResNet-101 teacher +with randomly initialized (rand) ResNet-50 students applied on the RVL-CDIP +dataset. 
+ +Teacher +R101_rand +– +R101 +R101 +R101 +R101 +R101 +R101 + +Student +– +R50 +R50rand + +Method +Baseline +Baseline +Vanilla [τ = 2.5, α = 0.5] +NKD [τ = 1, γ = 1.5] +MSE +SimKD [CLS+MLP] +SimKD [∅ projector] +FitNet [middle] + +ACC + +AURC + +ECE + +0.769 +0.760 +0.770 +0.765 +0.766 +0.774 +0.760 + +0.015 +0.017 +0.051 +0.022 +0.037 +0.025 +0.177 + +0.066 +0.071 +0.072 +0.068 +0.068 +0.063 +0.078 + + \ No newline at end of file diff --git a/assets/txts/pg_0285.txt b/assets/txts/pg_0285.txt new file mode 100644 index 0000000000000000000000000000000000000000..5767abffae86f6166b213e40c39efe1ae240c78d --- /dev/null +++ b/assets/txts/pg_0285.txt @@ -0,0 +1,17 @@ +Curriculum +JORDY VAN LANDEGHEM received an M.A. degree in Linguistics in 2015 and +an M.Sc. degree in artificial intelligence in 2017, both from KU Leuven, where +he is currently pursuing a Ph.D. degree in computer science. He completed +research internships at Oracle and Nuance Communications, and is currently +the lead AI Researcher at Contract.fit, a European SaaS start-up building +intelligent document processing solutions. +His industrial Ph.D. project entitled “Intelligent Automation for AI-Driven +Document Understanding” focuses on the fundamentals of probabilistic deep +learning, emphasizing calibration, uncertainty quantification, and out-ofdistribution robustness to obtain more reliable document intelligence systems. +Recently, he spearheaded the Document UnderstanDing of Everything (DUDE) +project and the ensuing ICDAR 2023 competition, with more research published +on reliable and scalable document understanding. + +253 + + \ No newline at end of file diff --git a/assets/txts/pg_0286.txt b/assets/txts/pg_0286.txt new file mode 100644 index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf --- /dev/null +++ b/assets/txts/pg_0286.txt @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/txts/pg_0287.txt b/assets/txts/pg_0287.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebe343634ebd0ff657f61f44c211da9ae2aadc6c --- /dev/null +++ b/assets/txts/pg_0287.txt @@ -0,0 +1,22 @@ +Publications +Journal Articles +Sumam Francis, Jordy Van Landeghem, and Marie-Francine Moens. Transfer +Learning for Named Entity Recognition in Financial and Biomedical Documents. +Information, 10(8):248, 2019 +Jordy Van Landeghem, Matthew Blaschko, Bertrand Anckaert, and MarieFrancine Moens. Benchmarking Scalable Predictive Uncertainty in Text +Classification. IEEE Access, 2022 + +Peer-reviewed International Conference and Workshop Articles +Jordy Van Landeghem, Matthew B Blaschko, Bertrand Anckaert, and MarieFrancine Moens. Predictive Uncertainty for Probabilistic Novelty Detection in +Text Classification. In ICML Workshop on Uncertainty and Robustness in Deep +Learning, 2020 +Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, +Dawid Jurkiewicz, Rafał Powalski, Paweł Józiak, Sanket Biswas, Mickaël +Coustaty, and Tomasz Stanisławek. ICDAR 2023 Competition on Document +UnderstanDing of Everything (DUDE). In International Conference on +Document Analysis and Recognition, pages 420–434. 
Springer, 2023 *Oral +Presentation + +255 + + \ No newline at end of file diff --git a/assets/txts/pg_0288.txt b/assets/txts/pg_0288.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce967af6839379e5024fe965aec9cb27744ca0db --- /dev/null +++ b/assets/txts/pg_0288.txt @@ -0,0 +1,25 @@ +256 + +PUBLICATIONS + +Jordy Van Landeghem, Rubèn Tito, Łukasz Borchmann, Michał Pietruszka, +Pawel Joziak, Rafal Powalski, Dawid Jurkiewicz, Mickaël Coustaty, Bertrand +Anckaert, Ernest Valveny, Matthew Blaschko, Marie-Francine Moens, and +Tomasz Stanisławek. Document Understanding Dataset and Evaluation (DUDE). +In Proceedings of the IEEE/CVF International Conference on Computer Vision, +pages 19528–19540, 2023 +Jordy Van Landeghem, Sanket Biswas, Matthew Blaschko, and Marie-Francine +Moens. Beyond Document Page Classification: Design, Datasets, and Challenges. +In Proceedings of the IEEE/CVF Winter Conference on Applications of +Computer Vision, pages 2962–2972, 2024 *Oral Presentation +Jordy Van Landeghem, Subhajit Maity, Ayan Banerjee, Matthew B Blaschko, +Marie-Francine Moens, Josep Llados, and Sanket Biswas. DistilDoc: Knowledge +Distillation for Visually-Rich Document Applications. In Proceedings of the +IEEE/CVF Conference on Computer Vision and Pattern Recognition (under +review), 2024 + +Organized Competitions +ICDAR2023 Competition on Document UnderstanDing of Everything (DUDE), +ICDAR, February-May, 2023, https://rrc.cvc.uab.es/?ch=23, Main organizer. + + \ No newline at end of file diff --git a/assets/txts/pg_0289.txt b/assets/txts/pg_0289.txt new file mode 100644 index 0000000000000000000000000000000000000000..8214d0ee079917c29e57d16e764fc46de8fb50bf --- /dev/null +++ b/assets/txts/pg_0289.txt @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/txts/pg_0290.txt b/assets/txts/pg_0290.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9799ff2cee861bc09e10c34450b305239a8eb3c --- /dev/null +++ b/assets/txts/pg_0290.txt @@ -0,0 +1,9 @@ +FACULTY OF ENGINEERING SCIENCE +DEPARTMENT OF COMPUTER SCIENCE +LANGUAGE INTELLIGENCE & INFORMATION RETRIEVAL LAB +Celestijnenlaan 200A box 2402 +B-3001 Leuven +jordy.vanlandeghem@cs.kuleuven.be +https://liir.cs.kuleuven.be/ + + \ No newline at end of file