HERIUN committed
Commit 591ba45 • 1 Parent(s): 6a07cb2

add models
- models/DocScanner/LICENSE.md +54 -0
- models/DocScanner/OCR_eval.py +78 -0
- models/DocScanner/README.md +96 -0
- models/DocScanner/__init__.py +0 -0
- models/DocScanner/__pycache__/__init__.cpython-38.pyc +0 -0
- models/DocScanner/__pycache__/__init__.cpython-39.pyc +0 -0
- models/DocScanner/__pycache__/extractor.cpython-38.pyc +0 -0
- models/DocScanner/__pycache__/extractor.cpython-39.pyc +0 -0
- models/DocScanner/__pycache__/inference.cpython-38.pyc +0 -0
- models/DocScanner/__pycache__/inference.cpython-39.pyc +0 -0
- models/DocScanner/__pycache__/model.cpython-38.pyc +0 -0
- models/DocScanner/__pycache__/model.cpython-39.pyc +0 -0
- models/DocScanner/__pycache__/seg.cpython-38.pyc +0 -0
- models/DocScanner/__pycache__/seg.cpython-39.pyc +0 -0
- models/DocScanner/__pycache__/update.cpython-38.pyc +0 -0
- models/DocScanner/__pycache__/update.cpython-39.pyc +0 -0
- models/DocScanner/eval.m +64 -0
- models/DocScanner/evalUnwarp.m +102 -0
- models/DocScanner/extractor.py +140 -0
- models/DocScanner/inference.py +65 -0
- models/DocScanner/model.py +104 -0
- models/DocScanner/ocr_img.txt +62 -0
- models/DocScanner/requirements.txt +6 -0
- models/DocScanner/seg.py +576 -0
- models/DocScanner/update.py +119 -0
- models/DocTr-Plus/GeoTr.py +960 -0
- models/DocTr-Plus/LICENSE.md +54 -0
- models/DocTr-Plus/OCR_eval.py +121 -0
- models/DocTr-Plus/README.md +79 -0
- models/DocTr-Plus/__init__.py +0 -0
- models/DocTr-Plus/__pycache__/GeoTr.cpython-38.pyc +0 -0
- models/DocTr-Plus/__pycache__/GeoTr.cpython-39.pyc +0 -0
- models/DocTr-Plus/__pycache__/__init__.cpython-38.pyc +0 -0
- models/DocTr-Plus/__pycache__/__init__.cpython-39.pyc +0 -0
- models/DocTr-Plus/__pycache__/extractor.cpython-38.pyc +0 -0
- models/DocTr-Plus/__pycache__/extractor.cpython-39.pyc +0 -0
- models/DocTr-Plus/__pycache__/inference.cpython-38.pyc +0 -0
- models/DocTr-Plus/__pycache__/inference.cpython-39.pyc +0 -0
- models/DocTr-Plus/__pycache__/position_encoding.cpython-38.pyc +0 -0
- models/DocTr-Plus/__pycache__/position_encoding.cpython-39.pyc +0 -0
- models/DocTr-Plus/evalUnwarp.m +46 -0
- models/DocTr-Plus/extractor.py +117 -0
- models/DocTr-Plus/inference.py +51 -0
- models/DocTr-Plus/position_encoding.py +125 -0
- models/DocTr-Plus/pyimagesearch/__init__.py +0 -0
- models/DocTr-Plus/pyimagesearch/transform.py +64 -0
- models/DocTr-Plus/requirements.txt +7 -0
- models/DocTr-Plus/ssimm_ldm_eval.m +36 -0
- models/Document-Image-Unwarping-pytorch +1 -0
models/DocScanner/LICENSE.md
ADDED
@@ -0,0 +1,54 @@
# License

Copyright © Hao Feng 2024. All Rights Reserved.

## 1. Definitions

1.1 "Algorithm" refers to the deep learning algorithm contained in this repository, including all associated code, documentation, and data.

1.2 "Author" refers to Hao Feng, the creator and copyright holder of the Algorithm.

1.3 "Non-Commercial Use" means use for academic research, personal study, or non-profit projects, without any direct or indirect commercial advantage.

1.4 "Commercial Use" means any use intended for or directed toward commercial advantage or monetary compensation.

## 2. Grant of Rights

2.1 Non-Commercial Use: The Author hereby grants you a worldwide, royalty-free, non-exclusive license to use, copy, modify, and distribute the Algorithm for Non-Commercial Use, subject to the conditions in Section 3.

2.2 Commercial Use: Any Commercial Use of the Algorithm is strictly prohibited without explicit prior written permission from the Author.

## 3. Conditions

3.1 For Non-Commercial Use:
a) Attribution: You must give appropriate credit to the Author, provide a link to this license, and indicate if changes were made.
b) Share-Alike: If you modify, transform, or build upon the Algorithm, you must distribute your contributions under the same license as this one.
c) No additional restrictions: You may not apply legal terms or technological measures that legally restrict others from doing anything this license permits.

3.2 For Commercial Use:
a) Prior Contact: Before any Commercial Use, you must contact the Author at haof@mail.ustc.edu.cn and obtain explicit written permission.
b) Separate Agreement: Commercial Use terms will be stipulated in a separate commercial license agreement.

## 4. Disclaimer of Warranty

The Algorithm is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement. In no event shall the Author be liable for any claim, damages, or other liability arising from, out of, or in connection with the Algorithm or the use or other dealings in the Algorithm.

## 5. Limitation of Liability

In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall the Author be liable to you for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this license or out of the use or inability to use the Algorithm.

## 6. Termination

6.1 This license and the rights granted hereunder will terminate automatically upon any breach by you of the terms of this license.

6.2 All sections which by their nature should survive the termination of this license shall survive such termination.

## 7. Miscellaneous

7.1 If any provision of this license is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable.

7.2 This license represents the complete agreement concerning the subject matter hereof.

By using the Algorithm, you acknowledge that you have read this license, understand it, and agree to be bound by its terms and conditions. If you do not agree to the terms and conditions of this license, do not use, modify, or distribute the Algorithm.

For permissions beyond the scope of this license, please contact the Author at haof@mail.ustc.edu.cn.
models/DocScanner/OCR_eval.py
ADDED
@@ -0,0 +1,78 @@
# imports required by the code below (absent in the committed file; added so the script runs)
import numpy as np
import pytesseract
from PIL import Image


def Levenshtein_Distance(str1, str2):
    # classic dynamic-programming edit distance between two strings
    matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
    for i in range(1, len(str1) + 1):
        for j in range(1, len(str2) + 1):
            if str1[i - 1] == str2[j - 1]:
                d = 0
            else:
                d = 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d
            )

    return matrix[len(str1)][len(str2)]


def cal_cer_ed(path_ours, tail="_rec"):
    path_gt = "./GT/"
    N = 66
    cer1 = []
    cer2 = []
    ed1 = []
    ed2 = []
    check = [0 for _ in range(N + 1)]  # per-document CER, kept for spot checks
    lis = [1, 2, 3, 4, 5, 6, 7, 9, 10, 21, 22, 23, 24, 27, 30, 31, 32, 36, 38,
           40, 41, 44, 45, 46, 47, 48, 50, 51, 52, 53]  # DocTr (Setting 1)
    # lis = [1,9,10,12,19,20,21,22,23,24,30,31,32,34,35,36,37,38,39,40,44,45,46,47,49]  # DewarpNet (Setting 2)
    for i in range(1, N):
        if i not in lis:
            continue
        gt = Image.open(path_gt + str(i) + ".png")
        img1 = Image.open(path_ours + str(i) + "_1" + tail)
        img2 = Image.open(path_ours + str(i) + "_2" + tail)
        content_gt = pytesseract.image_to_string(gt)
        content1 = pytesseract.image_to_string(img1)
        content2 = pytesseract.image_to_string(img2)
        l1 = Levenshtein_Distance(content_gt, content1)
        l2 = Levenshtein_Distance(content_gt, content2)
        ed1.append(l1)
        ed2.append(l2)
        cer1.append(l1 / len(content_gt))
        cer2.append(l2 / len(content_gt))
        check[i] = cer1[-1]
    print("CER: ", (np.mean(cer1) + np.mean(cer2)) / 2.0)
    print("ED: ", (np.mean(ed1) + np.mean(ed2)) / 2.0)


def evalu(path_ours, tail):
    cal_cer_ed(path_ours, tail)
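For reference, a minimal driver for `OCR_eval.py` above might look like the following. This is a sketch, not part of the commit: the `./rectified/` folder and the `"_rec.png"` suffix are placeholder values, and the import assumes you run from the `models/DocScanner/` directory next to the script.

```python
# hypothetical usage sketch for OCR_eval.py; paths are placeholders
from OCR_eval import evalu

# scores ./rectified/<i>_1_rec.png and ./rectified/<i>_2_rec.png against
# ./GT/<i>.png for every index in the Setting 1 list, then prints CER and ED
evalu("./rectified/", "_rec.png")
```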
models/DocScanner/README.md
ADDED
@@ -0,0 +1,96 @@
🔥 ***2024.4.28:*** **Good news! The code and pre-trained model of DocScanner are now released!**

🚀 **Good news! The [online demo](https://docai.doctrp.top:20443/) for DocScanner is now live, allowing for easy image upload and correction.**

🔥 **Good news! Our new work [DocTr++: Deep Unrestricted Document Image Rectification](https://github.com/fh2019ustc/DocTr-Plus) is out, capable of rectifying various distorted document images in the wild.**

🔥 **Good news! A comprehensive list of [Awesome Document Image Rectification](https://github.com/fh2019ustc/Awesome-Document-Image-Rectification) methods is available.**

# DocScanner

<p>
    <a href='https://drive.google.com/file/d/1mmCUj90rHyuO1SmpLt361youh-07Y0sD/view?usp=share_link' target="_blank"><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
    <a href='https://docai.doctrp.top:20443/' target="_blank"><img src='https://img.shields.io/badge/Online-Demo-green'></a>
</p>

This is a PyTorch/GPU re-implementation of the paper [DocScanner: Robust Document Image Rectification with Progressive Learning](https://drive.google.com/file/d/1mmCUj90rHyuO1SmpLt361youh-07Y0sD/view?usp=share_link).

![image](https://user-images.githubusercontent.com/50725551/209266364-aee68a88-090d-4f21-919a-092f19570d86.png)

## 🚀 Demo [(Link)](https://docai.doctrp.top:20443/)
***Note:*** The model version used in the demo corresponds to ***"DocScanner-L"*** as described in the paper.
1. Upload the distorted document image to be rectified in the left box.
2. Click the "Submit" button.
3. The rectified image will be displayed in the right box.

<img width="1534" alt="image" src="https://github.com/fh2019ustc/DocScanner/assets/50725551/9eca3f7d-1570-4246-a3db-0a1cf1eece2d">

### Examples
![image](https://user-images.githubusercontent.com/50725551/223947040-eac8389c-bed8-433d-b23b-679c926fba8f.png)
![image](https://user-images.githubusercontent.com/50725551/223946953-3a46d6a3-4361-41ef-bb5c-f235392e1f88.png)

## Training
- We train the **Document Localization Module** on the [Doc3D](https://github.com/fh2019ustc/doc3D-dataset) dataset; the [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) dataset is additionally exploited for background data augmentation.
- We train the **Progressive Rectification Module** on the [Doc3D](https://github.com/fh2019ustc/doc3D-dataset) dataset, using the background-excluded document images.

## Inference
1. Put the [pre-trained DocScanner-L](https://drive.google.com/drive/folders/1W1_DJU8dfEh6FqDYqFQ7ypR38Z8c5r4D?usp=sharing) in `$ROOT/model_pretrained/`.
2. Put the distorted images in `$ROOT/distorted/`.
3. Run the script below; the rectified images are saved in `$ROOT/rectified/` by default.
```
python inference.py
```

## Evaluation
- ***Important.*** In the [DocUNet Benchmark](https://www3.cs.stonybrook.edu/~cvl/docunet.html), the distorted images '64_1.png' and '64_2.png' are rotated by 180 degrees, so they do not match the GT documents. This is overlooked by most existing works, so please check before evaluating; note that the performance numbers reported in most existing work are computed with these two ***mistaken*** samples. A sketch for correcting them is given after this README.
- To reproduce the following quantitative performance on the ***corrected*** [DocUNet Benchmark](https://www3.cs.stonybrook.edu/~cvl/docunet.html), please use the geometrically rectified images available from [Google Drive](https://drive.google.com/drive/folders/1QBe26xJwIl38sWqK2ZE9ke5nu0Mpr4dW?usp=sharing). For the ***corrected*** performance of [other methods](https://github.com/fh2019ustc/Awesome-Document-Image-Rectification), please refer to the paper [DocScanner](https://arxiv.org/pdf/2110.14968v2.pdf).
- ***Image Metrics:*** We use the same MS-SSIM and LD evaluation code as the [DocUNet Benchmark](https://www3.cs.stonybrook.edu/~cvl/docunet.html) dataset, based on Matlab 2019a. Please interpret the scores with your Matlab version in mind. Our Matlab interface file is at `$ROOT/ssim_ld_eval.m`.
- ***OCR Metrics:*** The indices of the 30 documents (60 images) of the [DocUNet Benchmark](https://www3.cs.stonybrook.edu/~cvl/docunet.html) used for our OCR evaluation are listed in `$ROOT/ocr_img.txt` (*Setting 1*). Please refer to [DewarpNet](https://github.com/cvlab-stonybrook/DewarpNet) for the indices of the 25 documents (50 images) used for their OCR evaluation (*Setting 2*). The OCR evaluation code is at `$ROOT/OCR_eval.py`. The pytesseract version is 0.3.8, and the [Tesseract](https://digi.bib.uni-mannheim.de/tesseract/) version on Windows is the recent 5.0.1.20220118. Note that the computed performance differs slightly across operating systems.
- ***W_v and W_h Index:*** The layout results of the [DocUNet Benchmark](https://www3.cs.stonybrook.edu/~cvl/docunet.html) are available at [Google Drive](https://drive.google.com/drive/folders/1PcfWIowjM0AVKhZrRwGChM-2VAcUwWrF?usp=sharing).

| Method | MS-SSIM | LD | Li-D | ED (*Setting 1*) | CER | ED (*Setting 2*) | CER | Para. (M) |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| *DocScanner-T* | 0.5123 | 7.92 | 2.04 | 501.82 | 0.1823 | 809.46 | 0.2068 | 2.6 |
| *DocScanner-B* | 0.5134 | 7.62 | 1.88 | 434.11 | 0.1652 | 671.48 | 0.1789 | 5.2 |
| *DocScanner-L* | 0.5178 | 7.45 | 1.86 | 390.43 | 0.1486 | 632.34 | 0.1648 | 8.5 |

## Citation
Please cite the related works in your publications if they help your research:

```
@inproceedings{feng2021doctr,
  title={DocTr: Document Image Transformer for Geometric Unwarping and Illumination Correction},
  author={Feng, Hao and Wang, Yuechen and Zhou, Wengang and Deng, Jiajun and Li, Houqiang},
  booktitle={Proceedings of the 29th ACM International Conference on Multimedia},
  pages={273--281},
  year={2021}
}
```

```
@inproceedings{feng2022docgeonet,
  title={Geometric Representation Learning for Document Image Rectification},
  author={Feng, Hao and Zhou, Wengang and Deng, Jiajun and Wang, Yuechen and Li, Houqiang},
  booktitle={Proceedings of the European Conference on Computer Vision},
  year={2022}
}
```

```
@article{feng2021docscanner,
  title={DocScanner: robust document image rectification with progressive learning},
  author={Feng, Hao and Zhou, Wengang and Deng, Jiajun and Tian, Qi and Li, Houqiang},
  journal={arXiv preprint arXiv:2110.14968},
  year={2021}
}
```

## Acknowledgement
The code is largely based on [DocUNet](https://www3.cs.stonybrook.edu/~cvl/docunet.html) and [DewarpNet](https://github.com/cvlab-stonybrook/DewarpNet). Thanks for their wonderful work.

## Contact
For commercial usage, please contact Professor Wengang Zhou ([zhwg@ustc.edu.cn](zhwg@ustc.edu.cn)) and Hao Feng ([haof@mail.ustc.edu.cn](haof@mail.ustc.edu.cn)).
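As flagged in the Evaluation section, '64_1.png' and '64_2.png' are stored rotated by 180 degrees in the benchmark. A minimal correction sketch (the `./crop/` location is an assumption; point it at wherever the distorted benchmark images live):

```python
# rotate the two mistaken DocUNet benchmark images by 180 degrees in place;
# "./crop/" is a placeholder for the benchmark's distorted-image folder
from PIL import Image

for name in ("64_1.png", "64_2.png"):
    path = "./crop/" + name
    Image.open(path).rotate(180).save(path)
```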
models/DocScanner/__init__.py
ADDED
File without changes
models/DocScanner/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (154 Bytes)

models/DocScanner/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (154 Bytes)

models/DocScanner/__pycache__/extractor.cpython-38.pyc
ADDED
Binary file (3.88 kB)

models/DocScanner/__pycache__/extractor.cpython-39.pyc
ADDED
Binary file (3.86 kB)

models/DocScanner/__pycache__/inference.cpython-38.pyc
ADDED
Binary file (2.22 kB)

models/DocScanner/__pycache__/inference.cpython-39.pyc
ADDED
Binary file (2.2 kB)

models/DocScanner/__pycache__/model.cpython-38.pyc
ADDED
Binary file (3.37 kB)

models/DocScanner/__pycache__/model.cpython-39.pyc
ADDED
Binary file (3.37 kB)

models/DocScanner/__pycache__/seg.cpython-38.pyc
ADDED
Binary file (12 kB)

models/DocScanner/__pycache__/seg.cpython-39.pyc
ADDED
Binary file (12.1 kB)

models/DocScanner/__pycache__/update.cpython-38.pyc
ADDED
Binary file (4.3 kB)

models/DocScanner/__pycache__/update.cpython-39.pyc
ADDED
Binary file (4.27 kB)
models/DocScanner/eval.m
ADDED
@@ -0,0 +1,64 @@
path_rec = "xxx"; % rectified image path
path_scan = './scan/'; % scan image path
label_path = './layout/'; % layout result path

tarea = 598400;
ms1 = 0;
ld1 = 0;
lid1 = 0;
ms2 = 0;
ld2 = 0;
lid2 = 0;
wv = 0;
wh = 0;

sprintf(path_rec)
for i = 1:65
    path_rec_1 = sprintf("%s%d%s", path_rec, i, '_1 copy_rec.png'); % rectified image path
    path_rec_2 = sprintf("%s%d%s", path_rec, i, '_2 copy_rec.png'); % rectified image path
    path_scan_new = sprintf("%s%d%s", path_scan, i, '.png'); % corresponding scan image path
    bbox_i_path = sprintf("%s%d%s", label_path, i, '.txt'); % corresponding layout txt path

    % imread and rgb2gray
    A1 = imread(path_rec_1);
    A2 = imread(path_rec_2);

    % if i == 64
    %     A1 = rot90(A1,-2);
    %     A2 = rot90(A2,-2);
    % end

    ref = imread(path_scan_new);
    A1 = rgb2gray(A1);
    A2 = rgb2gray(A2);
    ref = rgb2gray(ref);
    bbox_i = read_txt(bbox_i_path);
    bbox_i = bbox_i + 1; % python index starts from 0

    % resize
    b = sqrt(tarea/size(ref,1)/size(ref,2));
    ref = imresize(ref, b);
    A1 = imresize(A1, [size(ref,1), size(ref,2)]);
    A2 = imresize(A2, [size(ref,1), size(ref,2)]);
    scaled_bbox_i = bbox_i * b * 0.5;
    scaled_bbox_i = round(scaled_bbox_i);
    scaled_bbox_i = max(scaled_bbox_i, 1);

    % calculate
    [ms_1, ld_1, lid_1, W_v_1, W_h_1] = evalUnwarp(A1, ref, scaled_bbox_i);
    [ms_2, ld_2, lid_2, W_v_2, W_h_2] = evalUnwarp(A2, ref, scaled_bbox_i);
    ms1 = ms1 + ms_1;
    ms2 = ms2 + ms_2;
    ld1 = ld1 + ld_1;
    ld2 = ld2 + ld_2;
    lid1 = lid1 + lid_1;
    lid2 = lid2 + lid_2;
    wv = wv + W_v_1 + W_v_2;
    wh = wh + W_h_1 + W_h_2;
end

ms = (ms1 + ms2) / 130 % MS-SSIM
ld = (ld1 + ld2) / 130 % local distortion
li_d = (lid1 + lid2) / 130 % line distortion
wv = wv / 130 % wv index
wh = wh / 130 % wh index
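For intuition, a worked instance of the resize step above (the numbers are illustrative, not from the commit): the factor `b` normalizes every scan to roughly `tarea` = 598400 pixels before the metrics are computed, so

$$b = \sqrt{\frac{598400}{H\,W}}, \qquad H \times W = 2000 \times 1500 \;\Rightarrow\; b \approx 0.447, \quad \text{resized scan} \approx 893 \times 670 \approx 5.98 \times 10^{5}\ \text{px}.$$

The rectified images are then resized to match the rescaled scan, so all 130 images (65 documents, two captures each) are compared at a comparable resolution.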
models/DocScanner/evalUnwarp.m
ADDED
@@ -0,0 +1,102 @@
function [ms, ld, li_d, wv, wh] = evalUnwarp(A, ref, data)
%EVALUNWARP compute MS-SSIM and LD between the unwarped image and the scan
%   A: unwarped image
%   ref: reference image, the scan image
%   ms: returned MS-SSIM value
%   ld: returned local distortion value
%   The Matlab image processing toolbox is necessary to compute ssim. The
%   weights for multi-scale ssim are directly adopted from:
%
%   Wang, Zhou, Eero P. Simoncelli, and Alan C. Bovik. "Multiscale structural
%   similarity for image quality assessment." In Signals, Systems and Computers,
%   2004. Conference Record of the Thirty-Seventh Asilomar Conference on, 2003.
%
%   Local distortion relies on the paper:
%   Liu, Ce, Jenny Yuen, and Antonio Torralba. "Sift flow: Dense correspondence
%   across scenes and its applications." In PAMI, 2010.
%
%   and its implementation:
%   https://people.csail.mit.edu/celiu/SIFTflow/

x = A;
y = ref;

im1 = imresize(imfilter(y, fspecial('gaussian', 7, 1.), 'same', 'replicate'), 0.5, 'bicubic');
im2 = imresize(imfilter(x, fspecial('gaussian', 7, 1.), 'same', 'replicate'), 0.5, 'bicubic');

im1 = im2double(im1);
im2 = im2double(im2);

cellsize = 3;
gridspacing = 1;

sift1 = mexDenseSIFT(im1, cellsize, gridspacing);
sift2 = mexDenseSIFT(im2, cellsize, gridspacing);

SIFTflowpara.alpha = 2*255;
SIFTflowpara.d = 40*255;
SIFTflowpara.gamma = 0.005*255;
SIFTflowpara.nlevels = 4;
SIFTflowpara.wsize = 2;
SIFTflowpara.topwsize = 10;
SIFTflowpara.nTopIterations = 60;
SIFTflowpara.nIterations = 30;

[vx, vy, ~] = SIFTflowc2f(sift1, sift2, SIFTflowpara);

rows1p = size(im1, 1);
cols1p = size(im1, 2);

% Li-D
rowstd_sum = 0;
for i = 1:rows1p
    rowstd = std(vy(i, :), 1);
    rowstd_sum = rowstd_sum + rowstd;
end
rowstd_mean = rowstd_sum / rows1p;

colstd_sum = 0;
for i = 1:cols1p
    colstd = std(vx(:, i), 1);
    colstd_sum = colstd_sum + colstd;
end
colstd_mean = colstd_sum / cols1p;

li_d = (rowstd_mean + colstd_mean) / 2;

% LD
d = sqrt(vx.^2 + vy.^2);
ld = mean(d(:));

% MS-SSIM
wt = [0.0448 0.2856 0.3001 0.2363 0.1333];
ss = zeros(5, 1);
for s = 1:5
    ss(s) = ssim(x, y);
    x = impyramid(x, 'reduce');
    y = impyramid(y, 'reduce');
end
ms = wt * ss;

% wv and wh
rowstd_sum = 0;
for i = 1:size(data, 1)
    rowstd_top = std(vy(data(i,2), data(i,1):data(i,3)), 1) / (data(i,3) - data(i,1));
    rowstd_bot = std(vy(data(i,4), data(i,1):data(i,3)), 1) / (data(i,3) - data(i,1));
    rowstd_sum = rowstd_sum + rowstd_top + rowstd_bot;
end
wv = rowstd_sum / (2 * size(data, 1));

colstd_sum = 0;
for i = 1:size(data, 1)
    colstd_left = std(vx(data(i,2):data(i,4), data(i,1)), 1) / (data(i,4) - data(i,2));
    colstd_right = std(vx(data(i,2):data(i,4), data(i,3)), 1) / (data(i,4) - data(i,2));
    colstd_sum = colstd_sum + colstd_left + colstd_right;
end
wh = colstd_sum / (2 * size(data, 1));

end
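Read directly off the code above (our rendering, not a formula quoted from the paper), with $(v_x, v_y)$ the SIFT-flow field between the two half-resolution images and $\sigma$ the population standard deviation, the distortion terms are

$$\mathrm{Li\text{-}D} = \frac{1}{2}\left(\frac{1}{H}\sum_{i=1}^{H} \sigma\bigl(v_y(i,\cdot)\bigr) + \frac{1}{W}\sum_{j=1}^{W} \sigma\bigl(v_x(\cdot,j)\bigr)\right), \qquad \mathrm{LD} = \operatorname{mean}\Bigl(\sqrt{v_x^2 + v_y^2}\Bigr),$$

i.e. Li-D penalizes residual bending of rows and columns, while LD averages the raw flow magnitude.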
models/DocScanner/extractor.py
ADDED
@@ -0,0 +1,140 @@
import torch.nn as nn


class ResidualBlock(nn.Module):
    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, padding=1, stride=stride
        )
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8  # unused here: no GroupNorm branch is defined below

        # note: only "batch" and "instance" are handled; the default norm_fn="group"
        # would leave self.norm1/self.norm2 unset (DocScanner builds this encoder
        # with norm_fn="instance", so that branch is never taken in this repo)
        if norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(planes)
            self.norm2 = nn.BatchNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.BatchNorm2d(planes)

        elif norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(planes)
            self.norm2 = nn.InstanceNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.InstanceNorm2d(planes)

        if stride == 1:
            self.downsample = None
        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
            )

    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)


class BottleneckBlock(nn.Module):
    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(BottleneckBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes // 4, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(
            planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride
        )
        self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        if norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(planes // 4)
            self.norm2 = nn.BatchNorm2d(planes // 4)
            self.norm3 = nn.BatchNorm2d(planes)
            if not stride == 1:
                self.norm4 = nn.BatchNorm2d(planes)

        elif norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(planes // 4)
            self.norm2 = nn.InstanceNorm2d(planes // 4)
            self.norm3 = nn.InstanceNorm2d(planes)
            if not stride == 1:
                self.norm4 = nn.InstanceNorm2d(planes)

        if stride == 1:
            self.downsample = None
        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4
            )

    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))
        y = self.relu(self.norm3(self.conv3(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)


class BasicEncoder(nn.Module):
    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super(BasicEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(64)

        self.conv1 = nn.Conv2d(3, 80, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 80
        self.layer1 = self._make_layer(80, stride=1)
        self.layer2 = self._make_layer(160, stride=2)
        self.layer3 = self._make_layer(240, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(240, output_dim, kernel_size=1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.conv2(x)

        return x
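As a quick sanity check (a sketch, not part of the commit): one stride-2 input convolution plus two stride-2 layers downsample the input by 8, so feature maps come out at 1/8 resolution. The import style matches model.py's and assumes the `models/` directory is on `sys.path`; the 256×256 input size is arbitrary.

```python
# minimal shape check for BasicEncoder under the DocScanner configuration
import torch
from DocScanner.extractor import BasicEncoder

enc = BasicEncoder(output_dim=320, norm_fn="instance")
x = torch.zeros(1, 3, 256, 256)
y = enc(x)
print(y.shape)  # torch.Size([1, 320, 32, 32]) -- three stride-2 stages => /8
```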
models/DocScanner/inference.py
ADDED
@@ -0,0 +1,65 @@
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import argparse
import warnings

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from DocScanner.model import DocScanner
from DocScanner.seg import U2NETP
from PIL import Image

warnings.filterwarnings("ignore")


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.msk = U2NETP(3, 1)  # document segmentation
        self.bm = DocScanner()  # rectification

    def forward(self, x):
        msk, _1, _2, _3, _4, _5, _6 = self.msk(x)
        msk = (msk > 0.5).float()
        x = msk * x  # zero out the background before rectification

        bm = self.bm(x, iters=12, test_mode=True)
        # map the absolute backward map into grid_sample's [-1, 1] coordinate range
        bm = (2 * (bm / 286.8) - 1) * 0.99

        return bm, msk


def reload_seg_model(model, path=""):
    if not bool(path):
        return model
    else:
        model_dict = model.state_dict()
        pretrained_dict = torch.load(path, map_location="cuda:0")
        # strip the 6-character name prefix used in the saved checkpoint before matching keys
        pretrained_dict = {
            k[6:]: v for k, v in pretrained_dict.items() if k[6:] in model_dict
        }
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)

        return model


def reload_rec_model(model, path=""):
    if not bool(path):
        return model
    else:
        model_dict = model.state_dict()
        pretrained_dict = torch.load(path, map_location="cuda:0")
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)

        return model
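Pieced together from `Net.forward` above, one plausible end-to-end use looks like the following. This is a sketch under assumptions: the weight paths are placeholders, the 288×288 working resolution is inferred from the 286.8 normalization constant, and the reload helpers hard-code `map_location="cuda:0"`, so a GPU is assumed when loading.

```python
# hypothetical end-to-end unwarping sketch; paths are placeholders
import cv2
import numpy as np
import torch
import torch.nn.functional as F

net = Net().eval()
net.msk = reload_seg_model(net.msk, "./model_pretrained/seg.pth")
net.bm = reload_rec_model(net.bm, "./model_pretrained/DocScanner-L.pth")

im = cv2.imread("./distorted/sample.png")
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) / 255.0
x = cv2.resize(im, (288, 288)).transpose(2, 0, 1)        # HWC -> CHW
x = torch.from_numpy(np.ascontiguousarray(x)).float()[None]
with torch.no_grad():
    bm, msk = net(x)                                     # bm: (1, 2, 288, 288), in [-1, 1]
    # grid_sample expects the sampling grid as (N, H, W, 2)
    rect = F.grid_sample(x, bm.permute(0, 2, 3, 1), align_corners=True)
```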
models/DocScanner/model.py
ADDED
@@ -0,0 +1,104 @@
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import torch
import torch.nn as nn
import torch.nn.functional as F
from DocScanner.extractor import BasicEncoder
from DocScanner.update import BasicUpdateBlock


def bilinear_sampler(img, coords, mode="bilinear", mask=False):
    """Wrapper for grid_sample, uses pixel coordinates"""
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    # rescale pixel coordinates into grid_sample's [-1, 1] range
    xgrid = 2 * xgrid / (W - 1) - 1
    ygrid = 2 * ygrid / (H - 1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True)
    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img


def coords_grid(batch, ht, wd):
    coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
    coords = torch.stack(coords[::-1], dim=0).float()  # (x, y) channel order
    return coords[None].repeat(batch, 1, 1, 1)


class DocScanner(nn.Module):
    def __init__(self):
        super(DocScanner, self).__init__()

        self.hidden_dim = hdim = 160
        self.context_dim = 160

        self.fnet = BasicEncoder(output_dim=320, norm_fn="instance")
        self.update_block = BasicUpdateBlock(hidden_dim=hdim)

    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def initialize_flow(self, img):
        N, C, H, W = img.shape
        coodslar = coords_grid(N, H, W).to(img.device)
        coords0 = coords_grid(N, H // 8, W // 8).to(img.device)
        coords1 = coords_grid(N, H // 8, W // 8).to(img.device)

        return coodslar, coords0, coords1

    def upsample_flow(self, flow, mask):
        # upsample the H/8 x W/8 flow to full resolution as a learned convex
        # combination (softmax weights) over each 3x3 coarse neighborhood
        N, _, H, W = flow.shape
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        up_flow = F.unfold(8 * flow, [3, 3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)

        return up_flow.reshape(N, 2, 8 * H, 8 * W)

    def forward(self, image1, iters=12, flow_init=None, test_mode=False):
        image1 = image1.contiguous()

        fmap1 = self.fnet(image1)

        warpfea = fmap1

        # split the 320-channel features into hidden state and context
        net, inp = torch.split(fmap1, [160, 160], dim=1)
        net = torch.tanh(net)
        inp = torch.relu(inp)

        coodslar, coords0, coords1 = self.initialize_flow(image1)

        if flow_init is not None:
            coords1 = coords1 + flow_init

        flow_predictions = []
        for itr in range(iters):
            coords1 = coords1.detach()
            flow = coords1 - coords0

            net, up_mask, delta_flow = self.update_block(net, inp, warpfea, flow)

            coords1 = coords1 + delta_flow
            flow_up = self.upsample_flow(coords1 - coords0, up_mask)
            bm_up = coodslar + flow_up

            # re-warp the features with the refined coordinates for the next iteration
            warpfea = bilinear_sampler(fmap1, coords1.permute(0, 2, 3, 1))
            flow_predictions.append(bm_up)

        if test_mode:
            return bm_up

        return flow_predictions
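A minimal smoke test for the full module (an assumption-laden sketch: it relies on the accompanying `update.py`, which is part of this commit but not shown above, and the input side length must be divisible by 8 because `initialize_flow` builds an H/8 × W/8 grid):

```python
# smoke test for DocScanner.forward; 288x288 matches the normalization
# constant used in inference.py (286.8 ~= 288)
import torch
from DocScanner.model import DocScanner

net = DocScanner().eval()
with torch.no_grad():
    bm = net(torch.zeros(1, 3, 288, 288), iters=12, test_mode=True)
print(bm.shape)  # torch.Size([1, 2, 288, 288]): a dense backward map in pixel units
```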
models/DocScanner/ocr_img.txt
ADDED
@@ -0,0 +1,62 @@
The images for OCR evaluation of the DocUNet Benchmark.
# Setting 1 (Setting from DocTr)
# Total 30 * 2 = 60 images.
./scan/1.png
./scan/2.png
./scan/3.png
./scan/4.png
./scan/5.png
./scan/6.png
./scan/7.png
./scan/9.png
./scan/10.png
./scan/21.png
./scan/22.png
./scan/23.png
./scan/24.png
./scan/27.png
./scan/30.png
./scan/31.png
./scan/32.png
./scan/36.png
./scan/38.png
./scan/40.png
./scan/41.png
./scan/44.png
./scan/45.png
./scan/46.png
./scan/47.png
./scan/48.png
./scan/50.png
./scan/51.png
./scan/52.png
./scan/53.png

# Setting 2 (Setting from DewarpNet)
# Link: https://github.com/cvlab-stonybrook/DewarpNet/blob/master/eval/ocr_eval/ocr_files.txt
# Total 25 * 2 = 50 images.
./scan/1.png
./scan/9.png
./scan/10.png
./scan/12.png
./scan/19.png
./scan/20.png
./scan/21.png
./scan/22.png
./scan/23.png
./scan/24.png
./scan/30.png
./scan/31.png
./scan/32.png
./scan/34.png
./scan/35.png
./scan/36.png
./scan/37.png
./scan/38.png
./scan/39.png
./scan/40.png
./scan/44.png
./scan/45.png
./scan/46.png
./scan/47.png
./scan/49.png
models/DocScanner/requirements.txt
ADDED
@@ -0,0 +1,6 @@
numpy==1.19.0
opencv_python==4.2.0.34
Pillow==9.4.0
scikit_image==0.17.2
skimage==0.0
torch==1.5.1+cu101
models/DocScanner/seg.py
ADDED
@@ -0,0 +1,576 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models


class sobel_net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_opx = nn.Conv2d(1, 1, 3, bias=False)
        self.conv_opy = nn.Conv2d(1, 1, 3, bias=False)
        sobel_kernelx = np.array(
            [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype="float32"
        ).reshape((1, 1, 3, 3))
        sobel_kernely = np.array(
            [[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype="float32"
        ).reshape((1, 1, 3, 3))
        self.conv_opx.weight.data = torch.from_numpy(sobel_kernelx)
        self.conv_opy.weight.data = torch.from_numpy(sobel_kernely)

        for p in self.parameters():
            p.requires_grad = False

    def forward(self, im):  # input rgb
        # rgb2gray
        x = (
            0.299 * im[:, 0, :, :] + 0.587 * im[:, 1, :, :] + 0.114 * im[:, 2, :, :]
        ).unsqueeze(1)
        gradx = self.conv_opx(x)
        grady = self.conv_opy(x)

        x = (gradx**2 + grady**2) ** 0.5
        x = (x - x.min()) / (x.max() - x.min())
        x = F.pad(x, (1, 1, 1, 1))

        x = torch.cat([im, x], dim=1)
        return x


class REBNCONV(nn.Module):
    def __init__(self, in_ch=3, out_ch=3, dirate=1):
        super(REBNCONV, self).__init__()

        self.conv_s1 = nn.Conv2d(
            in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate
        )
        self.bn_s1 = nn.BatchNorm2d(out_ch)
        self.relu_s1 = nn.ReLU(inplace=True)

    def forward(self, x):
        hx = x
        xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))

        return xout


## upsample tensor 'src' to have the same spatial size with tensor 'tar'
def _upsample_like(src, tar):
    src = F.interpolate(src, size=tar.shape[2:], mode="bilinear", align_corners=False)

    return src


### RSU-7 ###
class RSU7(nn.Module):  # UNet07DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU7, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x
        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)
        hx = self.pool3(hx3)

        hx4 = self.rebnconv4(hx)
        hx = self.pool4(hx4)

        hx5 = self.rebnconv5(hx)
        hx = self.pool5(hx5)

        hx6 = self.rebnconv6(hx)

        hx7 = self.rebnconv7(hx6)

        hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1))
        hx6dup = _upsample_like(hx6d, hx5)

        hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


### RSU-6 ###
class RSU6(nn.Module):  # UNet06DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU6, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)
        hx = self.pool3(hx3)

        hx4 = self.rebnconv4(hx)
        hx = self.pool4(hx4)

        hx5 = self.rebnconv5(hx)

        hx6 = self.rebnconv6(hx5)

        hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


### RSU-5 ###
class RSU5(nn.Module):  # UNet05DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU5, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)
        hx = self.pool3(hx3)

        hx4 = self.rebnconv4(hx)

        hx5 = self.rebnconv5(hx4)

        hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


### RSU-4 ###
class RSU4(nn.Module):  # UNet04DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU4, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)

        hx4 = self.rebnconv4(hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


### RSU-4F ###
class RSU4F(nn.Module):  # UNet04FRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU4F, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2)
        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8)

        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx2 = self.rebnconv2(hx1)
        hx3 = self.rebnconv3(hx2)

        hx4 = self.rebnconv4(hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
        hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1))
        hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1))

        return hx1d + hxin


##### U^2-Net ####
class U2NET(nn.Module):
    def __init__(self, in_ch=3, out_ch=1):
        super(U2NET, self).__init__()
        self.edge = sobel_net()

        self.stage1 = RSU7(in_ch, 32, 64)
        self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage2 = RSU6(64, 32, 128)
        self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage3 = RSU5(128, 64, 256)
        self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage4 = RSU4(256, 128, 512)
        self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage5 = RSU4F(512, 256, 512)
        self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage6 = RSU4F(512, 256, 512)

        # decoder
        self.stage5d = RSU4F(1024, 256, 512)
        self.stage4d = RSU4(1024, 128, 256)
        self.stage3d = RSU5(512, 64, 128)
        self.stage2d = RSU6(256, 32, 64)
        self.stage1d = RSU7(128, 16, 64)

        self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
        self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
        self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
        self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)

        self.outconv = nn.Conv2d(6, out_ch, 1)

    def forward(self, x):
        x = self.edge(x)
        hx = x

        # stage 1
        hx1 = self.stage1(hx)
        hx = self.pool12(hx1)

        # stage 2
        hx2 = self.stage2(hx)
        hx = self.pool23(hx2)

        # stage 3
        hx3 = self.stage3(hx)
        hx = self.pool34(hx3)

        # stage 4
        hx4 = self.stage4(hx)
        hx = self.pool45(hx4)

        # stage 5
        hx5 = self.stage5(hx)
        hx = self.pool56(hx5)

        # stage 6
        hx6 = self.stage6(hx)
        hx6up = _upsample_like(hx6, hx5)

        # -------------------- decoder --------------------
        hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))

        # side output
        d1 = self.side1(hx1d)

        d2 = self.side2(hx2d)
        d2 = _upsample_like(d2, d1)

        d3 = self.side3(hx3d)
        d3 = _upsample_like(d3, d1)

        d4 = self.side4(hx4d)
        d4 = _upsample_like(d4, d1)

        d5 = self.side5(hx5d)
        d5 = _upsample_like(d5, d1)

        d6 = self.side6(hx6)
        d6 = _upsample_like(d6, d1)

        d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1))

        return (
            torch.sigmoid(d0),
            torch.sigmoid(d1),
            torch.sigmoid(d2),
            torch.sigmoid(d3),
            torch.sigmoid(d4),
            torch.sigmoid(d5),
            torch.sigmoid(d6),
        )


### U^2-Net small ###
class U2NETP(nn.Module):
    def __init__(self, in_ch=3, out_ch=1):
        super(U2NETP, self).__init__()

        self.stage1 = RSU7(in_ch, 16, 64)
        self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage2 = RSU6(64, 16, 64)
        self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage3 = RSU5(64, 16, 64)
        self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage4 = RSU4(64, 16, 64)
        self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage5 = RSU4F(64, 16, 64)
        self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage6 = RSU4F(64, 16, 64)

        # decoder
        self.stage5d = RSU4F(128, 16, 64)
        self.stage4d = RSU4(128, 16, 64)
        self.stage3d = RSU5(128, 16, 64)
        self.stage2d = RSU6(128, 16, 64)
        self.stage1d = RSU7(128, 16, 64)

        self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side3 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side4 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side5 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side6 = nn.Conv2d(64, out_ch, 3, padding=1)

        self.outconv = nn.Conv2d(6, out_ch, 1)

    def forward(self, x):
        hx = x

        # stage 1
        hx1 = self.stage1(hx)
        hx = self.pool12(hx1)

        # stage 2
        hx2 = self.stage2(hx)
        hx = self.pool23(hx2)

        # stage 3
        hx3 = self.stage3(hx)
        hx = self.pool34(hx3)

        # stage 4
        hx4 = self.stage4(hx)
        hx = self.pool45(hx4)

        # stage 5
        hx5 = self.stage5(hx)
        hx = self.pool56(hx5)

        # stage 6
        hx6 = self.stage6(hx)
        hx6up = _upsample_like(hx6, hx5)

        # decoder
        hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))

        # side output
        d1 = self.side1(hx1d)

        d2 = self.side2(hx2d)
        d2 = _upsample_like(d2, d1)

        d3 = self.side3(hx3d)
        d3 = _upsample_like(d3, d1)

        d4 = self.side4(hx4d)
        d4 = _upsample_like(d4, d1)

        d5 = self.side5(hx5d)
        d5 = _upsample_like(d5, d1)

        d6 = self.side6(hx6)
        d6 = _upsample_like(d6, d1)

        d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1))

        return (
            torch.sigmoid(d0),
            torch.sigmoid(d1),
            torch.sigmoid(d2),
            torch.sigmoid(d3),
            torch.sigmoid(d4),
            torch.sigmoid(d5),
            torch.sigmoid(d6),
        )
+
torch.sigmoid(d6),
|
576 |
+
)
|
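The U^2-Net variants above follow the same pattern: a six-stage RSU encoder/decoder whose six sigmoid side outputs are fused by `outconv` into a final map `d0`. A minimal shape-check sketch, not repository code (the import path is hypothetical and the input size illustrative). Note that `U2NET` prepends a Sobel edge channel via `sobel_net`, so it must be built with `in_ch` equal to the image channels plus one (e.g. `U2NET(4, 1)` for RGB), while `U2NETP` takes the image directly:

```
import torch

from seg import U2NETP  # hypothetical import path for the module above

net = U2NETP(in_ch=3, out_ch=1).eval()
with torch.no_grad():
    x = torch.rand(1, 3, 288, 288)       # NCHW image, values in [0, 1]
    d0, d1, d2, d3, d4, d5, d6 = net(x)  # fused map + six side outputs
print(d0.shape)  # torch.Size([1, 1, 288, 288]); every output is sigmoid-activated
```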
models/DocScanner/update.py
ADDED
@@ -0,0 +1,119 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class FlowHead(nn.Module):
    def __init__(self, input_dim=128, hidden_dim=256):
        super(FlowHead, self).__init__()
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
        self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.conv2(self.relu(self.conv1(x)))


class ConvGRU(nn.Module):
    def __init__(self, hidden_dim=128, input_dim=192 + 128):
        super(ConvGRU, self).__init__()
        self.convz = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)
        self.convr = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)
        self.convq = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)

    def forward(self, h, x):
        hx = torch.cat([h, x], dim=1)

        z = torch.sigmoid(self.convz(hx))
        r = torch.sigmoid(self.convr(hx))
        q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1)))

        h = (1 - z) * h + z * q
        return h


class SepConvGRU(nn.Module):
    def __init__(self, hidden_dim=128, input_dim=192 + 128):
        super(SepConvGRU, self).__init__()
        self.convz1 = nn.Conv2d(
            hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)
        )
        self.convr1 = nn.Conv2d(
            hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)
        )
        self.convq1 = nn.Conv2d(
            hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)
        )

        self.convz2 = nn.Conv2d(
            hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)
        )
        self.convr2 = nn.Conv2d(
            hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)
        )
        self.convq2 = nn.Conv2d(
            hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)
        )

    def forward(self, h, x):
        # horizontal
        hx = torch.cat([h, x], dim=1)
        z = torch.sigmoid(self.convz1(hx))
        r = torch.sigmoid(self.convr1(hx))
        q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1)))
        h = (1 - z) * h + z * q

        # vertical
        hx = torch.cat([h, x], dim=1)
        z = torch.sigmoid(self.convz2(hx))
        r = torch.sigmoid(self.convr2(hx))
        q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1)))
        h = (1 - z) * h + z * q

        return h


class BasicMotionEncoder(nn.Module):
    def __init__(self):
        super(BasicMotionEncoder, self).__init__()
        self.convc1 = nn.Conv2d(320, 240, 1, padding=0)
        self.convc2 = nn.Conv2d(240, 160, 3, padding=1)
        self.convf1 = nn.Conv2d(2, 160, 7, padding=3)
        self.convf2 = nn.Conv2d(160, 80, 3, padding=1)
        self.conv = nn.Conv2d(160 + 80, 160 - 2, 3, padding=1)

    def forward(self, flow, corr):
        cor = F.relu(self.convc1(corr))
        cor = F.relu(self.convc2(cor))
        flo = F.relu(self.convf1(flow))
        flo = F.relu(self.convf2(flo))

        cor_flo = torch.cat([cor, flo], dim=1)
        out = F.relu(self.conv(cor_flo))
        return torch.cat([out, flow], dim=1)


class BasicUpdateBlock(nn.Module):
    def __init__(self, hidden_dim=128):
        super(BasicUpdateBlock, self).__init__()
        self.encoder = BasicMotionEncoder()
        self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=160 + 160)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=320)

        self.mask = nn.Sequential(
            nn.Conv2d(hidden_dim, 288, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(288, 64 * 9, 1, padding=0),
        )

    def forward(self, net, inp, corr, flow):
        motion_features = self.encoder(flow, corr)
        inp = torch.cat([inp, motion_features], dim=1)

        net = self.gru(net, inp)

        delta_flow = self.flow_head(net)

        mask = 0.25 * self.mask(net)

        return net, mask, delta_flow
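`BasicUpdateBlock` is a RAFT-style recurrent refinement unit: `BasicMotionEncoder` fuses correlation and flow features, `SepConvGRU` updates the hidden state with separable 1x5/5x1 convolutions, and the two heads emit a flow increment plus convex-upsampling weights. A minimal shape-check sketch, not repository code (the import path is hypothetical; the channel sizes are read off the layer definitions above):

```
import torch

from update import BasicUpdateBlock  # hypothetical import path

block = BasicUpdateBlock(hidden_dim=128).eval()
B, H, W = 1, 36, 36                    # a 1/8-resolution feature grid
net = torch.zeros(B, 128, H, W)        # GRU hidden state
inp = torch.zeros(B, 160, H, W)        # context features
corr = torch.zeros(B, 320, H, W)       # correlation features
flow = torch.zeros(B, 2, H, W)         # current flow estimate
with torch.no_grad():
    net, mask, delta_flow = block(net, inp, corr, flow)
print(mask.shape, delta_flow.shape)    # (B, 576, H, W) and (B, 2, H, W); 576 = 64 * 9
```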
models/DocTr-Plus/GeoTr.py
ADDED
@@ -0,0 +1,960 @@
import math
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import argparse
import copy
from typing import Optional

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn

# assumed timm dependency: OverlapPatchEmbed below relies on these helpers
from timm.models.layers import to_2tuple, trunc_normal_

from .extractor import BasicEncoder
from .position_encoding import build_position_encoding


class attnLayer(nn.Module):
    def __init__(
        self,
        d_model,
        nhead=8,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn_list = nn.ModuleList(
            [
                copy.deepcopy(nn.MultiheadAttention(d_model, nhead, dropout=dropout))
                for i in range(2)
            ]
        )
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2_list = nn.ModuleList(
            [copy.deepcopy(nn.LayerNorm(d_model)) for i in range(2)]
        )

        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2_list = nn.ModuleList(
            [copy.deepcopy(nn.Dropout(dropout)) for i in range(2)]
        )
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(
        self,
        tgt,
        memory_list,
        tgt_mask=None,
        memory_mask=None,
        tgt_key_padding_mask=None,
        memory_key_padding_mask=None,
        pos=None,
        memory_pos=None,
    ):
        q = k = self.with_pos_embed(tgt, pos)
        tgt2 = self.self_attn(
            q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        for memory, multihead_attn, norm2, dropout2, m_pos in zip(
            memory_list,
            self.multihead_attn_list,
            self.norm2_list,
            self.dropout2_list,
            memory_pos,
        ):
            tgt2 = multihead_attn(
                query=self.with_pos_embed(tgt, pos),
                key=self.with_pos_embed(memory, m_pos),
                value=memory,
                attn_mask=memory_mask,
                key_padding_mask=memory_key_padding_mask,
            )[0]
            tgt = tgt + dropout2(tgt2)
            tgt = norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    # Pre-norm variant; unreachable with the default normalize_before=False.
    # Note it still references single-memory attributes (self.norm2,
    # self.multihead_attn, self.dropout2) that this class does not define.
    def forward_pre(
        self,
        tgt,
        memory,
        tgt_mask=None,
        memory_mask=None,
        tgt_key_padding_mask=None,
        memory_key_padding_mask=None,
        pos=None,
        memory_pos=None,
    ):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, pos)
        tgt2 = self.self_attn(
            q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(
            query=self.with_pos_embed(tgt2, pos),
            key=self.with_pos_embed(memory, memory_pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(
        self,
        tgt,
        memory_list,
        tgt_mask=None,
        memory_mask=None,
        tgt_key_padding_mask=None,
        memory_key_padding_mask=None,
        pos=None,
        memory_pos=None,
    ):
        if self.normalize_before:
            return self.forward_pre(
                tgt,
                memory_list,
                tgt_mask,
                memory_mask,
                tgt_key_padding_mask,
                memory_key_padding_mask,
                pos,
                memory_pos,
            )
        return self.forward_post(
            tgt,
            memory_list,
            tgt_mask,
            memory_mask,
            tgt_key_padding_mask,
            memory_key_padding_mask,
            pos,
            memory_pos,
        )


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")


class TransDecoder(nn.Module):
    def __init__(self, num_attn_layers, hidden_dim=128):
        super(TransDecoder, self).__init__()
        attn_layer = attnLayer(hidden_dim)
        self.layers = _get_clones(attn_layer, num_attn_layers)
        self.position_embedding = build_position_encoding(hidden_dim)

    def forward(self, imgf, query_embed):
        pos = self.position_embedding(
            torch.ones(imgf.shape[0], imgf.shape[2], imgf.shape[3]).bool().cuda()
        )  # torch.Size([1, 128, 36, 36])

        bs, c, h, w = imgf.shape
        imgf = imgf.flatten(2).permute(2, 0, 1)
        # query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        pos = pos.flatten(2).permute(2, 0, 1)

        for layer in self.layers:
            query_embed = layer(query_embed, [imgf], pos=pos, memory_pos=[pos, pos])
        query_embed = query_embed.permute(1, 2, 0).reshape(bs, c, h, w)

        return query_embed


class TransEncoder(nn.Module):
    def __init__(self, num_attn_layers, hidden_dim=128):
        super(TransEncoder, self).__init__()
        attn_layer = attnLayer(hidden_dim)
        self.layers = _get_clones(attn_layer, num_attn_layers)
        self.position_embedding = build_position_encoding(hidden_dim)

    def forward(self, imgf):
        pos = self.position_embedding(
            torch.ones(imgf.shape[0], imgf.shape[2], imgf.shape[3]).bool().cuda()
        )  # torch.Size([1, 128, 36, 36])
        bs, c, h, w = imgf.shape
        imgf = imgf.flatten(2).permute(2, 0, 1)
        pos = pos.flatten(2).permute(2, 0, 1)

        for layer in self.layers:
            imgf = layer(imgf, [imgf], pos=pos, memory_pos=[pos, pos])
        imgf = imgf.permute(1, 2, 0).reshape(bs, c, h, w)

        return imgf


class FlowHead(nn.Module):
    def __init__(self, input_dim=128, hidden_dim=256):
        super(FlowHead, self).__init__()
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
        self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.conv2(self.relu(self.conv1(x)))


class UpdateBlock(nn.Module):
    def __init__(self, hidden_dim=128):
        super(UpdateBlock, self).__init__()
        self.flow_head = FlowHead(hidden_dim, hidden_dim=256)
        self.mask = nn.Sequential(
            nn.Conv2d(hidden_dim, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 64 * 9, 1, padding=0),
        )

    def forward(self, imgf, coords1):
        mask = 0.25 * self.mask(imgf)  # scale mask to balance gradients
        dflow = self.flow_head(imgf)
        coords1 = coords1 + dflow

        return mask, coords1


def coords_grid(batch, ht, wd):
    coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)


def upflow8(flow, mode="bilinear"):
    new_size = (8 * flow.shape[2], 8 * flow.shape[3])
    return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)


class OverlapPatchEmbed(nn.Module):
    """Image to Patch Embedding"""

    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.img_size = img_size
        self.patch_size = patch_size
        self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=stride,
            padding=(patch_size[0] // 2, patch_size[1] // 2),
        )
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, H, W


class GeoTr(nn.Module):
    def __init__(self):
        super(GeoTr, self).__init__()

        self.hidden_dim = hdim = 256

        self.fnet = BasicEncoder(output_dim=hdim, norm_fn="instance")

        self.encoder_block = ["encoder_block" + str(i) for i in range(3)]
        for i in self.encoder_block:
            self.__setattr__(i, TransEncoder(2, hidden_dim=hdim))
        self.down_layer = ["down_layer" + str(i) for i in range(2)]
        for i in self.down_layer:
            self.__setattr__(i, nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1))

        self.decoder_block = ["decoder_block" + str(i) for i in range(3)]
        for i in self.decoder_block:
            self.__setattr__(i, TransDecoder(2, hidden_dim=hdim))
        self.up_layer = ["up_layer" + str(i) for i in range(2)]
        for i in self.up_layer:
            self.__setattr__(
                i, nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
            )

        self.query_embed = nn.Embedding(81, self.hidden_dim)

        self.update_block = UpdateBlock(self.hidden_dim)

    def initialize_flow(self, img):
        N, C, H, W = img.shape
        coodslar = coords_grid(N, H, W).to(img.device)
        coords0 = coords_grid(N, H // 8, W // 8).to(img.device)
        coords1 = coords_grid(N, H // 8, W // 8).to(img.device)

        return coodslar, coords0, coords1

    def upsample_flow(self, flow, mask):
        N, _, H, W = flow.shape
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        up_flow = F.unfold(8 * flow, [3, 3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)

        return up_flow.reshape(N, 2, 8 * H, 8 * W)

    def forward(self, image1):
        fmap = self.fnet(image1)
        fmap = torch.relu(fmap)

        # fmap = self.TransEncoder(fmap)
        fmap1 = self.__getattr__(self.encoder_block[0])(fmap)
        fmap1d = self.__getattr__(self.down_layer[0])(fmap1)
        fmap2 = self.__getattr__(self.encoder_block[1])(fmap1d)
        fmap2d = self.__getattr__(self.down_layer[1])(fmap2)
        fmap3 = self.__getattr__(self.encoder_block[2])(fmap2d)

        query_embed0 = self.query_embed.weight.unsqueeze(1).repeat(1, fmap3.size(0), 1)
        fmap3d_ = self.__getattr__(self.decoder_block[0])(fmap3, query_embed0)
        fmap3du_ = (
            self.__getattr__(self.up_layer[0])(fmap3d_).flatten(2).permute(2, 0, 1)
        )
        fmap2d_ = self.__getattr__(self.decoder_block[1])(fmap2, fmap3du_)
        fmap2du_ = (
            self.__getattr__(self.up_layer[1])(fmap2d_).flatten(2).permute(2, 0, 1)
        )
        fmap_out = self.__getattr__(self.decoder_block[2])(fmap1, fmap2du_)

        # convex upsample based on fmap_out
        coodslar, coords0, coords1 = self.initialize_flow(image1)
        coords1 = coords1.detach()
        mask, coords1 = self.update_block(fmap_out, coords1)
        flow_up = self.upsample_flow(coords1 - coords0, mask)
        bm_up = coodslar + flow_up

        return bm_up


## upsample tensor 'src' to have the same spatial size as tensor 'tar'
def _upsample_like(src, tar):
    src = F.interpolate(src, size=tar.shape[2:], mode="bilinear", align_corners=False)

    return src


class REBNCONV(nn.Module):
    def __init__(self, in_ch=3, out_ch=3, dirate=1):
        super(REBNCONV, self).__init__()

        self.conv_s1 = nn.Conv2d(
            in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate
        )
        self.bn_s1 = nn.BatchNorm2d(out_ch)
        self.relu_s1 = nn.ReLU(inplace=True)

    def forward(self, x):
        hx = x
        xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))

        return xout


### RSU-4 ###
class RSU4(nn.Module):  # UNet04DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU4, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)

        hx4 = self.rebnconv4(hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


### RSU-4F ###
class RSU4F(nn.Module):  # UNet04FRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU4F, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2)
        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8)

        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx2 = self.rebnconv2(hx1)
        hx3 = self.rebnconv3(hx2)

        hx4 = self.rebnconv4(hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
        hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1))
        hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1))

        return hx1d + hxin


class sobel_net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_opx = nn.Conv2d(1, 1, 3, bias=False)
        self.conv_opy = nn.Conv2d(1, 1, 3, bias=False)
        sobel_kernelx = np.array(
            [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype="float32"
        ).reshape((1, 1, 3, 3))
        sobel_kernely = np.array(
            [[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype="float32"
        ).reshape((1, 1, 3, 3))
        self.conv_opx.weight.data = torch.from_numpy(sobel_kernelx)
        self.conv_opy.weight.data = torch.from_numpy(sobel_kernely)

        for p in self.parameters():
            p.requires_grad = False

    def forward(self, im):  # input rgb
        x = (
            0.299 * im[:, 0, :, :] + 0.587 * im[:, 1, :, :] + 0.114 * im[:, 2, :, :]
        ).unsqueeze(1)  # rgb2gray
        gradx = self.conv_opx(x)
        grady = self.conv_opy(x)

        x = (gradx**2 + grady**2) ** 0.5
        x = (x - x.min()) / (x.max() - x.min())
        x = F.pad(x, (1, 1, 1, 1))

        x = torch.cat([im, x], dim=1)
        return x


##### U^2-Net ####
class U2NET(nn.Module):
    def __init__(self, in_ch=3, out_ch=1):
        super(U2NET, self).__init__()
        self.edge = sobel_net()

        self.stage1 = RSU7(in_ch, 32, 64)
        self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage2 = RSU6(64, 32, 128)
        self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage3 = RSU5(128, 64, 256)
        self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage4 = RSU4(256, 128, 512)
        self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage5 = RSU4F(512, 256, 512)
        self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage6 = RSU4F(512, 256, 512)

        # decoder
        self.stage5d = RSU4F(1024, 256, 512)
        self.stage4d = RSU4(1024, 128, 256)
        self.stage3d = RSU5(512, 64, 128)
        self.stage2d = RSU6(256, 32, 64)
        self.stage1d = RSU7(128, 16, 64)

        self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
        self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
        self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
        self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)

        self.outconv = nn.Conv2d(6, out_ch, 1)

    def forward(self, x):
        x = self.edge(x)
        hx = x

        # stage 1
        hx1 = self.stage1(hx)
        hx = self.pool12(hx1)

        # stage 2
        hx2 = self.stage2(hx)
        hx = self.pool23(hx2)

        # stage 3
        hx3 = self.stage3(hx)
        hx = self.pool34(hx3)

        # stage 4
        hx4 = self.stage4(hx)
        hx = self.pool45(hx4)

        # stage 5
        hx5 = self.stage5(hx)
        hx = self.pool56(hx5)

        # stage 6
        hx6 = self.stage6(hx)
        hx6up = _upsample_like(hx6, hx5)

        # -------------------- decoder --------------------
        hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))

        # side output
        d1 = self.side1(hx1d)

        d2 = self.side2(hx2d)
        d2 = _upsample_like(d2, d1)

        d3 = self.side3(hx3d)
        d3 = _upsample_like(d3, d1)

        d4 = self.side4(hx4d)
        d4 = _upsample_like(d4, d1)

        d5 = self.side5(hx5d)
        d5 = _upsample_like(d5, d1)

        d6 = self.side6(hx6)
        d6 = _upsample_like(d6, d1)

        d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1))

        return (
            torch.sigmoid(d0),
            torch.sigmoid(d1),
            torch.sigmoid(d2),
            torch.sigmoid(d3),
            torch.sigmoid(d4),
            torch.sigmoid(d5),
            torch.sigmoid(d6),
        )


### RSU-5 ###
class RSU5(nn.Module):  # UNet05DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU5, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)
        hx = self.pool3(hx3)

        hx4 = self.rebnconv4(hx)

        hx5 = self.rebnconv5(hx4)

        hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


### RSU-6 ###
class RSU6(nn.Module):  # UNet06DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU6, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x

        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)
        hx = self.pool3(hx3)

        hx4 = self.rebnconv4(hx)
        hx = self.pool4(hx4)

        hx5 = self.rebnconv5(hx)

        hx6 = self.rebnconv6(hx5)

        hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


### RSU-7 ###
class RSU7(nn.Module):  # UNet07DRES(nn.Module):
    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
        super(RSU7, self).__init__()

        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)

        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
        self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1)

        self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2)

        self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

    def forward(self, x):
        hx = x
        hxin = self.rebnconvin(hx)

        hx1 = self.rebnconv1(hxin)
        hx = self.pool1(hx1)

        hx2 = self.rebnconv2(hx)
        hx = self.pool2(hx2)

        hx3 = self.rebnconv3(hx)
        hx = self.pool3(hx3)

        hx4 = self.rebnconv4(hx)
        hx = self.pool4(hx4)

        hx5 = self.rebnconv5(hx)
        hx = self.pool5(hx5)

        hx6 = self.rebnconv6(hx)

        hx7 = self.rebnconv7(hx6)

        hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1))
        hx6dup = _upsample_like(hx6d, hx5)

        hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))

        return hx1d + hxin


class U2NETP(nn.Module):
    def __init__(self, in_ch=3, out_ch=1):
        super(U2NETP, self).__init__()

        self.stage1 = RSU7(in_ch, 16, 64)
        self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage2 = RSU6(64, 16, 64)
        self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage3 = RSU5(64, 16, 64)
        self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage4 = RSU4(64, 16, 64)
        self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage5 = RSU4F(64, 16, 64)
        self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.stage6 = RSU4F(64, 16, 64)

        # decoder
        self.stage5d = RSU4F(128, 16, 64)
        self.stage4d = RSU4(128, 16, 64)
        self.stage3d = RSU5(128, 16, 64)
        self.stage2d = RSU6(128, 16, 64)
        self.stage1d = RSU7(128, 16, 64)

        self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side3 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side4 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side5 = nn.Conv2d(64, out_ch, 3, padding=1)
        self.side6 = nn.Conv2d(64, out_ch, 3, padding=1)

        self.outconv = nn.Conv2d(6, out_ch, 1)

    def forward(self, x):
        hx = x

        # stage 1
        hx1 = self.stage1(hx)
        hx = self.pool12(hx1)

        # stage 2
        hx2 = self.stage2(hx)
        hx = self.pool23(hx2)

        # stage 3
        hx3 = self.stage3(hx)
        hx = self.pool34(hx3)

        # stage 4
        hx4 = self.stage4(hx)
        hx = self.pool45(hx4)

        # stage 5
        hx5 = self.stage5(hx)
        hx = self.pool56(hx5)

        # stage 6
        hx6 = self.stage6(hx)
        hx6up = _upsample_like(hx6, hx5)

        # decoder
        hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
        hx5dup = _upsample_like(hx5d, hx4)

        hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
        hx4dup = _upsample_like(hx4d, hx3)

        hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
        hx3dup = _upsample_like(hx3d, hx2)

        hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
        hx2dup = _upsample_like(hx2d, hx1)

        hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))

        # side output
        d1 = self.side1(hx1d)

        d2 = self.side2(hx2d)
        d2 = _upsample_like(d2, d1)

        d3 = self.side3(hx3d)
        d3 = _upsample_like(d3, d1)

        d4 = self.side4(hx4d)
        d4 = _upsample_like(d4, d1)

        d5 = self.side5(hx5d)
        d5 = _upsample_like(d5, d1)

        d6 = self.side6(hx6)
        d6 = _upsample_like(d6, d1)

        d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1))

        return (
            torch.sigmoid(d0),
            torch.sigmoid(d1),
            torch.sigmoid(d2),
            torch.sigmoid(d3),
            torch.sigmoid(d4),
            torch.sigmoid(d5),
            torch.sigmoid(d6),
        )
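`GeoTr` runs a CNN encoder at 1/8 resolution, three transformer encoder stages with two downsampling steps between them, a coarse-to-fine decoder seeded by the 81-entry learned query (a 9x9 grid at 1/32 resolution, which implies a 288x288 input), and finally convex upsampling of the predicted flow into a full-resolution backward map. A minimal forward-pass sketch, not repository code: the import path is hypothetical (the module uses package-relative imports), and since the position encodings above call `.cuda()`, a CUDA device is required.

```
import torch

from GeoTr import GeoTr  # hypothetical import path

model = GeoTr().cuda().eval()
with torch.no_grad():
    img = torch.rand(1, 3, 288, 288).cuda()  # normalized RGB input
    bm = model(img)                          # backward map, shape (1, 2, 288, 288)
# bm gives, for each output pixel, the pixel location to sample from the
# distorted input; normalizing it to [-1, 1] and calling F.grid_sample on the
# input image yields the rectified result.
print(bm.shape)
```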
models/DocTr-Plus/LICENSE.md
ADDED
@@ -0,0 +1,54 @@
# License

Copyright © Hao Feng 2024. All Rights Reserved.

## 1. Definitions

1.1 "Algorithm" refers to the deep learning algorithm contained in this repository, including all associated code, documentation, and data.

1.2 "Author" refers to Hao Feng, the creator and copyright holder of the Algorithm.

1.3 "Non-Commercial Use" means use for academic research, personal study, or non-profit projects, without any direct or indirect commercial advantage.

1.4 "Commercial Use" means any use intended for or directed toward commercial advantage or monetary compensation.

## 2. Grant of Rights

2.1 Non-Commercial Use: The Author hereby grants you a worldwide, royalty-free, non-exclusive license to use, copy, modify, and distribute the Algorithm for Non-Commercial Use, subject to the conditions in Section 3.

2.2 Commercial Use: Any Commercial Use of the Algorithm is strictly prohibited without explicit prior written permission from the Author.

## 3. Conditions

3.1 For Non-Commercial Use:
a) Attribution: You must give appropriate credit to the Author, provide a link to this license, and indicate if changes were made.
b) Share-Alike: If you modify, transform, or build upon the Algorithm, you must distribute your contributions under the same license as this one.
c) No additional restrictions: You may not apply legal terms or technological measures that legally restrict others from doing anything this license permits.

3.2 For Commercial Use:
a) Prior Contact: Before any Commercial Use, you must contact the Author at haof@mail.ustc.edu.cn and obtain explicit written permission.
b) Separate Agreement: Commercial Use terms will be stipulated in a separate commercial license agreement.

## 4. Disclaimer of Warranty

The Algorithm is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement. In no event shall the Author be liable for any claim, damages, or other liability arising from, out of, or in connection with the Algorithm or the use or other dealings in the Algorithm.

## 5. Limitation of Liability

In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall the Author be liable to you for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this license or out of the use or inability to use the Algorithm.

## 6. Termination

6.1 This license and the rights granted hereunder will terminate automatically upon any breach by you of the terms of this license.

6.2 All sections which by their nature should survive the termination of this license shall survive such termination.

## 7. Miscellaneous

7.1 If any provision of this license is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable.

7.2 This license represents the complete agreement concerning the subject matter hereof.

By using the Algorithm, you acknowledge that you have read this license, understand it, and agree to be bound by its terms and conditions. If you do not agree to the terms and conditions of this license, do not use, modify, or distribute the Algorithm.

For permissions beyond the scope of this license, please contact the Author at haof@mail.ustc.edu.cn.
models/DocTr-Plus/OCR_eval.py
ADDED
@@ -0,0 +1,121 @@
import time

import numpy as np
import pytesseract
from PIL import Image

pytesseract.get_tesseract_version()


def Levenshtein_Distance(str1, str2):
    matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
    for i in range(1, len(str1) + 1):
        for j in range(1, len(str2) + 1):
            if str1[i - 1] == str2[j - 1]:
                d = 0
            else:
                d = 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d
            )

    return matrix[len(str1)][len(str2)]


def cal_cer_ed(path_ours, tail="_rec"):
    print(path_ours, "start")
    print(f"started at {time.strftime('%H:%M:%S')}")
    path_gt = "./scan/"
    N = 196
    cer1 = []
    ed1 = []
    check = [0 for _ in range(N + 1)]
    # img index in UDIR test set for OCR evaluation
    lis = [
        2, 5, 17, 19, 20, 23, 31, 37, 38, 39, 40, 41, 43, 45, 47, 48, 51, 54,
        57, 60, 61, 62, 64, 65, 67, 68, 70, 75, 76, 77, 78, 80, 81, 83, 84, 85,
        87, 88, 90, 91, 93, 96, 99, 100, 101, 102, 103, 104, 105, 134, 137, 138,
        140, 150, 151, 155, 158, 162, 163, 164, 165, 166, 169, 170, 172, 173,
        175, 177, 178, 182,
    ]
    for i in range(1, N):
        if i not in lis:
            continue
        gt = Image.open(path_gt + str(i) + ".png")
        img1 = Image.open(path_ours + str(i) + tail)
        content_gt = pytesseract.image_to_string(gt)
        content1 = pytesseract.image_to_string(img1)
        l1 = Levenshtein_Distance(content_gt, content1)
        ed1.append(l1)
        cer1.append(l1 / len(content_gt))
        check[i] = cer1[-1]

    CER = np.mean(cer1)
    ED = np.mean(ed1)
    print(f"finished at {time.strftime('%H:%M:%S')}")
    return [path_ours, CER, ED]
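A toy check of the edit-distance and CER computation above (illustrative strings, not evaluation data):

```
gt = "document rectification"
pred = "docunent rectificatior"      # two single-character substitutions
ed = Levenshtein_Distance(gt, pred)  # -> 2
cer = ed / len(gt)                   # -> 2 / 22, roughly 0.091
print(ed, round(cer, 3))
```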
models/DocTr-Plus/README.md
ADDED
@@ -0,0 +1,79 @@
🔥 **Good news! Our demo has already exceeded 20,000 calls!**

🔥 **Good news! Our work has been accepted by IEEE Transactions on Multimedia.**

🚀 **Exciting update! We have created a demo for our paper, showcasing the generic rectification capabilities of our method. [Check it out here!](https://doctrp.docscanner.top/)**

🔥 **Good news! Our new work achieves state-of-the-art performance on the [DocUNet Benchmark](https://www3.cs.stonybrook.edu/~cvl/docunet.html) dataset: [DocScanner: Robust Document Image Rectification with Progressive Learning](https://drive.google.com/file/d/1mmCUj90rHyuO1SmpLt361youh-07Y0sD/view?usp=share_link)**, with its [repo](https://github.com/fh2019ustc/DocScanner).

🔥 **Good news! A comprehensive list of [Awesome Document Image Rectification](https://github.com/fh2019ustc/Awesome-Document-Image-Rectification) methods is available.**

# DocTr++

<p>
    <a href='https://project.doctrp.top/' target="_blank"><img src='https://img.shields.io/badge/Project-Page-Green'></a>
    <a href='https://arxiv.org/abs/2304.08796' target="_blank"><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
    <a href='https://demo.doctrp.top/' target="_blank"><img src='https://img.shields.io/badge/Online-Demo-green'></a>
</p>

![Demo](assets/github_demo.png)
![Demo](assets/github_demo_v2.png)

> **[DocTr++: Deep Unrestricted Document Image Rectification](https://arxiv.org/abs/2304.08796)**

> DocTr++ is an enhanced version of the original [DocTr: Document Image Transformer for Geometric Unwarping and Illumination Correction](https://github.com/fh2019ustc/DocTr), aiming to rectify various distorted document images in the wild, whether or not the document is fully present in the image.

Any questions or discussions are welcome!


## 🚀 Demo [(Link)](https://demo.doctrp.top/)
1. Upload the distorted document image to be rectified in the left box.
2. Click the "Submit" button.
3. The rectified image will be displayed in the right box.
4. Our demo runs on CPU infrastructure, so some display latency may be experienced due to image transmission over the network.

[![Alt text](https://user-images.githubusercontent.com/50725551/232952015-15508ad6-e38c-475b-bf9e-91cb74bc5fea.png)](https://demo.doctrp.top/)


## Inference
1. Put the pretrained model in `$ROOT/model_pretrained/`.
2. Put the distorted images in `$ROOT/distorted/`.
3. Run the script below; the rectified images are saved in `$ROOT/rectified/` by default.
```
python inference.py
```
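
For programmatic use, a rough sketch of the rectification step follows; `inference.py` remains the reference implementation. The import path, checkpoint file name, and image paths below are illustrative assumptions, and the grid normalization is the standard `grid_sample` recipe rather than code taken from this repository.

```
import cv2
import numpy as np
import torch
import torch.nn.functional as F

from GeoTr import GeoTr  # adjust to your package layout; GeoTr.py uses relative imports

model = GeoTr().cuda().eval()
state = torch.load("model_pretrained/doctr_plus.pth", map_location="cuda")  # hypothetical file name
model.load_state_dict(state, strict=False)

im = cv2.cvtColor(cv2.imread("distorted/example.png"), cv2.COLOR_BGR2RGB)
im = cv2.resize(im, (288, 288)).astype(np.float32) / 255.0  # GeoTr's learned queries imply 288x288 input
inp = torch.from_numpy(im.transpose(2, 0, 1))[None].cuda()

with torch.no_grad():
    bm = model(inp)                                       # backward map in pixel coordinates
    grid = (2.0 * bm / 287.0 - 1.0).permute(0, 2, 3, 1)   # normalize to [-1, 1], NHWC for grid_sample
    rec = F.grid_sample(inp, grid, align_corners=True)    # sample the distorted image -> flat scan

out = (rec[0].permute(1, 2, 0).cpu().numpy() * 255.0).astype(np.uint8)
cv2.imwrite("rectified/example.png", cv2.cvtColor(out, cv2.COLOR_RGB2BGR))
```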
|
48 |
+
|
49 |
+
## Evaluation
|
50 |
+
- ***Image Metrics:*** We propose the metrics MS-SSIM-M and LD-M, different from that for [DocUNet Benchmark](https://www3.cs.stonybrook.edu/~cvl/docunet.html) dataset. We use Matlab 2019a. Please compare the scores according to your Matlab version. We provide our Matlab interface file at ```$ROOT/ssim_ld_eval.m```.
|
51 |
+
- ***OCR Metrics:*** The index of 70 document (70 images) in [UDIR test set](https://drive.google.com/drive/folders/15rknyt7XE2k6jrxaTc_n5dzXIdCukJLh?usp=share_link) used for our OCR evaluation is provided in ```$ROOT/ocr_eval.py```.
|
52 |
+
The version of pytesseract is 0.3.8, and the version of [Tesseract](https://digi.bib.uni-mannheim.de/tesseract/) in Windows is recent 5.0.1.20220118.
|
53 |
+
Note that in different operating systems, the calculated performance has slight differences.
|
54 |
+
|
55 |
+
## Citation

If you find this code useful for your research, please use the following BibTeX entries.

```
@inproceedings{feng2021doctr,
  title={DocTr: Document Image Transformer for Geometric Unwarping and Illumination Correction},
  author={Feng, Hao and Wang, Yuechen and Zhou, Wengang and Deng, Jiajun and Li, Houqiang},
  booktitle={Proceedings of the 29th ACM International Conference on Multimedia},
  pages={273--281},
  year={2021}
}
```

```
@article{feng2023doctrp,
  title={Deep Unrestricted Document Image Rectification},
  author={Feng, Hao and Liu, Shaokai and Deng, Jiajun and Zhou, Wengang and Li, Houqiang},
  journal={IEEE Transactions on Multimedia},
  year={2023}
}
```
## Contact

For commercial usage, please contact Professor Wengang Zhou ([zhwg@ustc.edu.cn](mailto:zhwg@ustc.edu.cn)) and Hao Feng ([haof@mail.ustc.edu.cn](mailto:haof@mail.ustc.edu.cn)).

models/DocTr-Plus/__init__.py
ADDED
File without changes

models/DocTr-Plus/__pycache__/GeoTr.cpython-38.pyc
ADDED
Binary file (23.5 kB)

models/DocTr-Plus/__pycache__/GeoTr.cpython-39.pyc
ADDED
Binary file (23.5 kB)

models/DocTr-Plus/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (154 Bytes)

models/DocTr-Plus/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (154 Bytes)

models/DocTr-Plus/__pycache__/extractor.cpython-38.pyc
ADDED
Binary file (3.13 kB)

models/DocTr-Plus/__pycache__/extractor.cpython-39.pyc
ADDED
Binary file (3.12 kB)

models/DocTr-Plus/__pycache__/inference.cpython-38.pyc
ADDED
Binary file (1.61 kB)

models/DocTr-Plus/__pycache__/inference.cpython-39.pyc
ADDED
Binary file (1.61 kB)

models/DocTr-Plus/__pycache__/position_encoding.cpython-38.pyc
ADDED
Binary file (4.38 kB)

models/DocTr-Plus/__pycache__/position_encoding.cpython-39.pyc
ADDED
Binary file (4.34 kB)

models/DocTr-Plus/evalUnwarp.m
ADDED
@@ -0,0 +1,46 @@
function [ms, ld] = evalUnwarp(A, ref, ref_msk)
% evalUnwarp computes the masked multi-scale SSIM (ms) and masked local
% distortion (ld) between a rectified image A and its scanned reference
% ref; ref_msk is the reference with non-document pixels zeroed out.
% Requires the SIFT Flow toolbox (mexDenseSIFT, SIFTflowc2f) on the path.

x = A;
y = ref;
z = ref_msk;

% smooth and downsample before dense SIFT matching
im1 = imresize(imfilter(x, fspecial('gaussian', 7, 1.), 'same', 'replicate'), 0.5, 'bicubic');
im2 = imresize(imfilter(y, fspecial('gaussian', 7, 1.), 'same', 'replicate'), 0.5, 'bicubic');
im3 = imresize(imfilter(z, fspecial('gaussian', 7, 1.), 'same', 'replicate'), 0.5, 'bicubic');

im1 = im2double(im1);
im2 = im2double(im2);
im3 = im2double(im3);

cellsize = 3;
gridspacing = 1;

sift1 = mexDenseSIFT(im1, cellsize, gridspacing);
sift2 = mexDenseSIFT(im2, cellsize, gridspacing);

SIFTflowpara.alpha = 2*255;
SIFTflowpara.d = 40*255;
SIFTflowpara.gamma = 0.005*255;
SIFTflowpara.nlevels = 4;
SIFTflowpara.wsize = 2;
SIFTflowpara.topwsize = 10;
SIFTflowpara.nTopIterations = 60;
SIFTflowpara.nIterations = 30;

[vx, vy, ~] = SIFTflowc2f(sift1, sift2, SIFTflowpara);

% local distortion: mean flow magnitude over the document region only
d = sqrt(vx.^2 + vy.^2);
mskk = (im3 == 0);
ld = mean(d(~mskk));

% five-scale SSIM against the masked reference, combined with the
% standard MS-SSIM weights
wt = [0.0448 0.2856 0.3001 0.2363 0.1333];
ss = zeros(5, 1);
for s = 1 : 5
    ss(s) = ssim(x, z);
    x = impyramid(x, 'reduce');
    z = impyramid(z, 'reduce');
end
ms = wt * ss;

end

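As a rough Python cross-check of the five-scale weighted-SSIM loop above, here is a hedged sketch that uses skimage and OpenCV stand-ins for MATLAB's `ssim` and `impyramid`; their defaults differ, so scores will not match MATLAB exactly, and the images are assumed large enough for the default SSIM window at the coarsest scale.

```python
import cv2
from skimage.metrics import structural_similarity as ssim

def ms_ssim_masked(x, z, weights=(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)):
    # x: rectified grayscale image (uint8); z: masked scanned reference (uint8)
    score = 0.0
    for w in weights:
        score += w * ssim(x, z, data_range=255)
        x, z = cv2.pyrDown(x), cv2.pyrDown(z)  # Gaussian pyramid, like impyramid
    return score
```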
models/DocTr-Plus/extractor.py
ADDED
@@ -0,0 +1,117 @@
import torch.nn as nn


class ResidualBlock(nn.Module):
    """Two 3x3 convolutions with a configurable normalization layer and an
    optional strided 1x1 projection on the skip connection."""

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, padding=1, stride=stride
        )
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        if norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if not stride == 1:
                self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)

        elif norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(planes)
            self.norm2 = nn.BatchNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.BatchNorm2d(planes)

        elif norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(planes)
            self.norm2 = nn.InstanceNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.InstanceNorm2d(planes)

        elif norm_fn == "none":
            self.norm1 = nn.Sequential()
            self.norm2 = nn.Sequential()
            if not stride == 1:
                self.norm3 = nn.Sequential()

        if stride == 1:
            self.downsample = None
        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
            )

    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)


class BasicEncoder(nn.Module):
    """CNN feature extractor: a stride-2 stem followed by three residual
    stages, producing an 8x-downsampled feature map of output_dim channels."""

    def __init__(self, output_dim=128, norm_fn="batch"):
        super(BasicEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)

        elif self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == "none":
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(128, stride=2)
        self.layer3 = self._make_layer(192, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(192, output_dim, kernel_size=1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.conv2(x)

        return x

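A quick shape check of the encoder defined above; the 288×288 input matches the network's working resolution, and the norm choice here is arbitrary.

```python
import torch

enc = BasicEncoder(output_dim=128, norm_fn="instance")
feat = enc(torch.zeros(1, 3, 288, 288))
print(feat.shape)  # torch.Size([1, 128, 36, 36]) -- 8x downsampled
```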
models/DocTr-Plus/inference.py
ADDED
@@ -0,0 +1,51 @@
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F

from .GeoTr import U2NETP, GeoTr

warnings.filterwarnings("ignore")


class GeoTrP(nn.Module):
    """Wraps GeoTr and rescales its predicted backward map from the 288x288
    working resolution to a 2560x2560 output canvas."""

    def __init__(self):
        super(GeoTrP, self).__init__()
        self.GeoTr = GeoTr()

    def forward(self, x):
        bm = self.GeoTr(x)  # backward map at the 288x288 working resolution
        bm = 2 * (bm / 288) - 1  # normalize coordinates to [-1, 1]

        bm = (bm + 1) / 2 * 2560  # rescale to absolute 2560x2560 coordinates

        bm = F.interpolate(bm, size=(2560, 2560), mode="bilinear", align_corners=True)

        return bm


def reload_model(model, path=""):
    """Load pretrained weights from path into model; an empty path returns
    the model unchanged, and unmatched keys keep their initialization."""
    if not bool(path):
        return model
    else:
        model_dict = model.state_dict()
        # assumes a CUDA device; use map_location="cpu" for CPU-only inference
        pretrained_dict = torch.load(path, map_location="cuda:0")
        print(len(pretrained_dict.keys()))
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)

    return model

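A hedged usage sketch: the checkpoint file name is hypothetical (substitute the actual weights placed in `model_pretrained/`), and it assumes `GeoTr` accepts a 288×288 RGB tensor.

```python
import os
import torch

model = GeoTrP()
ckpt = "./model_pretrained/doctr_plus.pth"  # hypothetical checkpoint name
model = reload_model(model, ckpt if os.path.exists(ckpt) else "")
model.eval()

with torch.no_grad():
    bm = model(torch.zeros(1, 3, 288, 288))  # backward map for a dummy input
print(bm.shape)  # expected: torch.Size([1, 2, 2560, 2560])
```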
models/DocTr-Plus/position_encoding.py
ADDED
@@ -0,0 +1,125 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Various positional encodings for the transformer.
"""
import math
from typing import Optional

import torch
from torch import Tensor, nn


class NestedTensor(object):
    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor  # noqa
        cast_tensor = self.tensors.to(device)
        mask = self.mask
        if mask is not None:
            assert mask is not None
            cast_mask = mask.to(device)
        else:
            cast_mask = None
        return NestedTensor(cast_tensor, cast_mask)

    def decompose(self):
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention Is All You Need paper, generalized to work on images.
    """

    def __init__(
        self, num_pos_feats=64, temperature=10000, normalize=False, scale=None
    ):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, mask):
        assert mask is not None
        y_embed = mask.cumsum(1, dtype=torch.float32)
        x_embed = mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # keep the frequency tensor on the same device as the input mask
        dim_t = torch.arange(
            self.num_pos_feats, dtype=torch.float32, device=mask.device
        )
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """

    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = (
            torch.cat(
                [
                    x_emb.unsqueeze(0).repeat(h, 1, 1),
                    y_emb.unsqueeze(1).repeat(1, w, 1),
                ],
                dim=-1,
            )
            .permute(2, 0, 1)
            .unsqueeze(0)
            .repeat(x.shape[0], 1, 1, 1)
        )
        return pos


def build_position_encoding(hidden_dim=512, position_embedding="sine"):
    N_steps = hidden_dim // 2
    if position_embedding in ("v2", "sine"):
        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
    elif position_embedding in ("v3", "learned"):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {position_embedding}")

    return position_embedding

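A hedged shape check of the sine variant; the feature-map size here is arbitrary.

```python
import torch

pos_enc = build_position_encoding(hidden_dim=512, position_embedding="sine")
mask = torch.ones(1, 36, 36)  # all-ones validity mask over a 36x36 feature map
pos = pos_enc(mask)
print(pos.shape)  # torch.Size([1, 512, 36, 36])
```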
models/DocTr-Plus/pyimagesearch/__init__.py
ADDED
File without changes
models/DocTr-Plus/pyimagesearch/transform.py
ADDED
@@ -0,0 +1,64 @@
import cv2
import numpy as np


def order_points(pts):
    # initialize a list of coordinates that will be ordered
    # such that the first entry in the list is the top-left,
    # the second entry is the top-right, the third is the
    # bottom-right, and the fourth is the bottom-left
    rect = np.zeros((4, 2), dtype="float32")

    # the top-left point will have the smallest sum, whereas
    # the bottom-right point will have the largest sum
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]

    # now, compute the difference between the points; the
    # top-right point will have the smallest difference,
    # whereas the bottom-left will have the largest difference
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]

    # return the ordered coordinates
    return rect


def four_point_transform(image, pts):
    # obtain a consistent order of the points and unpack them
    # individually
    rect = order_points(pts)
    (tl, tr, br, bl) = rect

    # compute the width of the new image, which will be the
    # maximum distance between bottom-right and bottom-left
    # x-coordinates or the top-right and top-left x-coordinates
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
    maxWidth = max(int(widthA), int(widthB))

    # compute the height of the new image, which will be the
    # maximum distance between the top-right and bottom-right
    # y-coordinates or the top-left and bottom-left y-coordinates
    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
    maxHeight = max(int(heightA), int(heightB))

    # now that we have the dimensions of the new image, construct
    # the set of destination points to obtain a "bird's-eye view"
    # (i.e. top-down view) of the image, again specifying points
    # in top-left, top-right, bottom-right, and bottom-left order
    dst = np.array(
        [[0, 0], [maxWidth - 1, 0], [maxWidth - 1, maxHeight - 1], [0, maxHeight - 1]],
        dtype="float32",
    )

    # compute the perspective transform matrix and then apply it
    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))

    # return the warped image
    return warped

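A hedged usage sketch of the helper above; the file names and corner coordinates are made up for illustration, and the corners may be given in any order.

```python
import cv2
import numpy as np

image = cv2.imread("distorted/page.jpg")
corners = np.array([[52, 30], [610, 44], [625, 820], [38, 806]], dtype="float32")
warped = four_point_transform(image, corners)  # cropped, top-down view of the page
cv2.imwrite("rectified/page_flat.png", warped)
```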
models/DocTr-Plus/requirements.txt
ADDED
@@ -0,0 +1,6 @@
numpy==1.19.0
opencv_python==4.2.0.34
Pillow==9.4.0
scikit_image==0.17.2
thop==0.1.1.post2209072238
torch==1.5.1+cu101

models/DocTr-Plus/ssimm_ldm_eval.m
ADDED
@@ -0,0 +1,36 @@
path_rec = 'xxx'; % rectified image path
path_scan = './UDIR/gt/'; % scan image path

tarea = 598400; % target pixel count used to normalize image resolution
ms = 0;
ld = 0;

for i = 1:195
    path_rec_1 = sprintf("%s%d%s", path_rec, i, '.png'); % rectified image path
    path_scan_new = sprintf("%s%d%s", path_scan, i, '.png'); % corresponding scan image path

    % imread and rgb2gray
    A1 = imread(path_rec_1);
    ref = imread(path_scan_new);
    A1 = rgb2gray(A1);
    ref = rgb2gray(ref);

    % resize so the reference has roughly tarea pixels
    b = sqrt(tarea/size(ref,1)/size(ref,2));
    ref = imresize(ref, b);
    ref_msk = ref;
    A1 = imresize(A1, [size(ref,1), size(ref,2)]);

    % mask the gt image where the rectified image is empty
    m1 = A1 == 0;
    ref_msk(m1) = 0;

    % calculate and accumulate the per-image metrics
    [ms_1, ld_1] = evalUnwarp(A1, ref, ref_msk);
    ms = ms + ms_1;
    ld = ld + ld_1;

end

% averages over the 195 test images (no semicolons, so the results print)
ms_m = ms / 195
ld_m = ld / 195

models/Document-Image-Unwarping-pytorch
ADDED
@@ -0,0 +1 @@
Subproject commit 92b29172b981d132f7b31e767505524f8cc7af7a