Spaces:

deepdoctection
/

deepdoctection

Runtime error

App Files Files Community

JaMe76 committed on Apr 13, 2023

Commit

397d15f

•

1 Parent(s): 04c7117

update space

Browse files

Files changed (2) hide show

app.py +120 -83
conf_dd_one.yaml +57 -18

app.py CHANGED Viewed

@@ -1,27 +1,33 @@
 import os
 os.system('pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html')
 # work around: https://discuss.huggingface.co/t/how-to-install-a-specific-version-of-gradio-in-spaces/13552
 os.system("pip uninstall -y gradio")
 os.system("pip install gradio==3.4.1")
 from os import getcwd, path, environ
 import deepdoctection as dd
 from deepdoctection.dataflow.serialize import DataFromList
 import gradio as gr
 _DD_ONE = "conf_dd_one.yaml"
-_DETECTIONS = ["table", "ocr"]
-dd.ModelCatalog.register("layout/model_final_inf_only.pt",dd.ModelProfile(
-            name="layout/model_final_inf_only.pt",
-            description="Detectron2 layout detection model trained on private datasets",
-            config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
             size=[274632215],
             tp_model=False,
-            hf_repo_id=environ.get("HF_REPO"),
             hf_model_name="model_final_inf_only.pt",
             hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
             categories={"1": dd.LayoutType.text,
@@ -29,6 +35,33 @@ dd.ModelCatalog.register("layout/model_final_inf_only.pt",dd.ModelProfile(
                         "3": dd.LayoutType.list,
                         "4": dd.LayoutType.table,
                         "5": dd.LayoutType.figure},
         ))
 # Set up of the configuration and logging. Models are globally defined, so that they are not re-loaded once the input
@@ -60,26 +93,30 @@ categories_item = dd.ModelCatalog.get_profile(cfg.WEIGHTS.D2ITEM).categories
 assert categories_item is not None
 d_item = dd.D2FrcnnDetector(item_config_path, item_weights_path, categories_item, device=cfg.DEVICE)
-# word detector
-det = dd.DoctrTextlineDetector()
-# text recognizer
-rec = dd.DoctrTextRecognizer()
-def build_gradio_analyzer(table, table_ref, ocr):
     """Building the Detectron2/DocTr analyzer based on the given config"""
     cfg.freeze(freezed=False)
-    cfg.TAB = table
-    cfg.TAB_REF = table_ref
-    cfg.OCR = ocr
     cfg.freeze()
     pipe_component_list = []
     layout = dd.ImageLayoutService(d_layout, to_image=True, crop_image=True)
     pipe_component_list.append(layout)
     if cfg.TAB:
         detect_result_generator = dd.DetectResultGenerator(categories_cell)
@@ -92,15 +129,12 @@ def build_gradio_analyzer(table, table_ref, ocr):
         table_segmentation = dd.TableSegmentationService(
             cfg.SEGMENTATION.ASSIGNMENT_RULE,
-            cfg.SEGMENTATION.IOU_THRESHOLD_ROWS
-            if cfg.SEGMENTATION.ASSIGNMENT_RULE in ["iou"]
-            else cfg.SEGMENTATION.IOA_THRESHOLD_ROWS,
-            cfg.SEGMENTATION.IOU_THRESHOLD_COLS
-            if cfg.SEGMENTATION.ASSIGNMENT_RULE in ["iou"]
-            else cfg.SEGMENTATION.IOA_THRESHOLD_COLS,
             cfg.SEGMENTATION.FULL_TABLE_TILING,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
         )
         pipe_component_list.append(table_segmentation)
@@ -109,69 +143,43 @@ def build_gradio_analyzer(table, table_ref, ocr):
             pipe_component_list.append(table_segmentation_refinement)
     if cfg.OCR:
-        d_layout_text = dd.ImageLayoutService(det, to_image=True, crop_image=True)
-        pipe_component_list.append(d_layout_text)
-        d_text = dd.TextExtractionService(rec, extract_from_roi="WORD")
         pipe_component_list.append(d_text)
-        match = dd.MatchingService(
             parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
-            child_categories=dd.LayoutType.word,
             matching_rule=cfg.WORD_MATCHING.RULE,
-            threshold=cfg.WORD_MATCHING.IOU_THRESHOLD
-            if cfg.WORD_MATCHING.RULE in ["iou"]
-            else cfg.WORD_MATCHING.IOA_THRESHOLD,
         )
-        pipe_component_list.append(match)
         order = dd.TextOrderService(
-            text_container=dd.LayoutType.word,
-            floating_text_block_names=[dd.LayoutType.title, dd.LayoutType.text, dd.LayoutType.list],
-            text_block_names=[
-                dd.LayoutType.title,
-                dd.LayoutType.text,
-                dd.LayoutType.list,
-                dd.LayoutType.cell,
-                dd.CellType.header,
-                dd.CellType.body,
-            ],
         )
         pipe_component_list.append(order)
     pipe = dd.DoctectionPipe(pipeline_component_list=pipe_component_list)
-    return pipe
-def prepare_output(dp, add_table, add_ocr):
-    out = dp.as_dict()
-    out.pop("_image")
-    layout_items = dp.layouts
-    if add_ocr:
-        layout_items.sort(key=lambda x: x.reading_order)
-    layout_items_str = ""
-    for item in layout_items:
-        layout_items_str += f"\n {item.category_name}: {item.text}"
-    if add_table:
-        html_list = [table.html for table in dp.tables]
-        if html_list:
-            html = ("\n").join(html_list)
-        else:
-            html = None
-    else:
-        html = None
-    return dp.viz(show_table_structure=False), layout_items_str, html, out
-def analyze_image(img, pdf, attributes):
     # creating an image object and passing to the analyzer by using dataflows
-    add_table = _DETECTIONS[0] in attributes
-    add_ocr = _DETECTIONS[1] in attributes
-    analyzer = build_gradio_analyzer(add_table, add_table, add_ocr)
     if img is not None:
         image = dd.Image(file_name="input.png", location="")
@@ -180,20 +188,39 @@ def analyze_image(img, pdf, attributes):
         df = DataFromList(lst=[image])
         df = analyzer.analyze(dataset_dataflow=df)
     elif pdf:
-        df = analyzer.analyze(path=pdf.name, max_datapoints=3)
     else:
         raise ValueError
     df.reset_state()
-    df_iter = iter(df)
-    dp = next(df_iter)
-    return prepare_output(dp, add_table, add_ocr)
 demo = gr.Blocks(css="scrollbar.css")
 with demo:
     with gr.Box():
         gr.Markdown("<h1><center>deepdoctection - A Document AI Package</center></h1>")
@@ -201,8 +228,11 @@ with demo:
                     " and document layout analysis tasks using deep learning models. It does not implement models"
                     " but enables you to build pipelines using highly acknowledged libraries for object detection,"
                     " OCR and selected NLP tasks and provides an integrated frameworks for fine-tuning, evaluating"
-                    " and running models.\n This pipeline consists of a stack of models powered by <strong>Detectron2"
-                    "</strong> for layout analysis and table recognition and <strong>DocTr</strong> for OCR.")
     with gr.Box():
         gr.Markdown("<h2><center>Upload a document and choose setting</center></h2>")
         with gr.Row():
@@ -221,8 +251,9 @@ with demo:
                 gr.Examples(examples=[path.join(getcwd(), "sample_3.pdf")], inputs = inputs_pdf)
         with gr.Row():
-            tok_input = gr.CheckboxGroup(
-                _DETECTIONS, value=_DETECTIONS, label="Additional extractions", interactive=True)
         with gr.Row():
             btn = gr.Button("Run model", variant="primary")
@@ -233,17 +264,23 @@ with demo:
                 with gr.Box():
                     gr.Markdown("<center><strong>Contiguous text</strong></center>")
                     image_text = gr.Textbox()
-                with gr.Box():
-                    gr.Markdown("<center><strong>Table</strong></center>")
-                    html = gr.HTML()
-                with gr.Box():
-                    gr.Markdown("<center><strong>JSON</strong></center>")
-                    json = gr.JSON()
             with gr.Column():
                 with gr.Box():
                     gr.Markdown("<center><strong>Layout detection</strong></center>")
-                    image_output = gr.Image(type="numpy", label="Output Image")
-    btn.click(fn=analyze_image, inputs=[inputs, inputs_pdf, tok_input], outputs=[image_output, image_text, html, json])
-demo.launch()

 import os
 os.system('pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html')
+credentials_kwargs={"aws_access_key_id": os.environ["ACCESS_KEY"],"aws_secret_access_key": os.environ["SECRET_KEY"]}
 # work around: https://discuss.huggingface.co/t/how-to-install-a-specific-version-of-gradio-in-spaces/13552
 os.system("pip uninstall -y gradio")
 os.system("pip install gradio==3.4.1")
+os.system(os.environ["DD_ADDONS"])
 from os import getcwd, path, environ
 import deepdoctection as dd
 from deepdoctection.dataflow.serialize import DataFromList
+from dd_addons.extern import PdfTextDetector, PostProcessor, get_xsl_path
+from dd_addons.pipe.conn import PostProcessorService
 import gradio as gr
 _DD_ONE = "conf_dd_one.yaml"
+_XSL_PATH = get_xsl_path()
+dd.ModelCatalog.register("xrf_layout/model_final_inf_only.pt",dd.ModelProfile(
+            name="xrf_layout/model_final_inf_only.pt",
+            description="layout_detection/morning-dragon-114",
+            config="xrf_dd/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
             size=[274632215],
             tp_model=False,
+            hf_repo_id=environ.get("HF_REPO_LAYOUT"),
             hf_model_name="model_final_inf_only.pt",
             hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
             categories={"1": dd.LayoutType.text,
                         "3": dd.LayoutType.list,
                         "4": dd.LayoutType.table,
                         "5": dd.LayoutType.figure},
+            model_wrapper="D2FrcnnDetector",
+        ))
+dd.ModelCatalog.register("xrf_cell/model_final_inf_only.pt", dd.ModelProfile(
+            name="xrf_cell/model_final_inf_only.pt",
+            description="cell_detection/restful-eon-6",
+            config="xrf_dd/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
+            size=[274583063],
+            tp_model=False,
+            hf_repo_id=environ.get("HF_REPO_CELL"),
+            hf_model_name="model_final_inf_only.pt",
+            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
+            categories={"1": dd.LayoutType.cell},
+            model_wrapper="D2FrcnnDetector",
+        ))
+dd.ModelCatalog.register("xrf_item/model_final_inf_only.pt", dd.ModelProfile(
+            name="xrf_item/model_final_inf_only.pt",
+            description="item_detection/firm_plasma_14",
+            config="xrf_dd/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
+            size=[274595351],
+            tp_model=False,
+            hf_repo_id=environ.get("HF_REPO_ITEM"),
+            hf_model_name="model_final_inf_only.pt",
+            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
+            categories={"1": dd.LayoutType.row, "2": dd.LayoutType.column},
+            model_wrapper="D2FrcnnDetector",
         ))
 # Set up of the configuration and logging. Models are globally defined, so that they are not re-loaded once the input
 assert categories_item is not None
 d_item = dd.D2FrcnnDetector(item_config_path, item_weights_path, categories_item, device=cfg.DEVICE)
+# pdf miner
+pdf_text = PdfTextDetector(_XSL_PATH)
+# text detector
+tex_text = dd.TextractOcrDetector(**credentials_kwargs)
+def build_gradio_analyzer():
     """Building the Detectron2/DocTr analyzer based on the given config"""
     cfg.freeze(freezed=False)
+    cfg.TAB = True
+    cfg.TAB_REF = True
+    cfg.OCR = True
     cfg.freeze()
     pipe_component_list = []
     layout = dd.ImageLayoutService(d_layout, to_image=True, crop_image=True)
     pipe_component_list.append(layout)
+    nms_service = dd.AnnotationNmsService(nms_pairs=cfg.LAYOUT_NMS_PAIRS.COMBINATIONS,
+                                          thresholds=cfg.LAYOUT_NMS_PAIRS.THRESHOLDS)
+    pipe_component_list.append(nms_service)
     if cfg.TAB:
         detect_result_generator = dd.DetectResultGenerator(categories_cell)
         table_segmentation = dd.TableSegmentationService(
             cfg.SEGMENTATION.ASSIGNMENT_RULE,
+            cfg.SEGMENTATION.THRESHOLD_ROWS,
+            cfg.SEGMENTATION.THRESHOLD_COLS,
             cfg.SEGMENTATION.FULL_TABLE_TILING,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
+            cfg.SEGMENTATION.STRETCH_RULE
         )
         pipe_component_list.append(table_segmentation)
             pipe_component_list.append(table_segmentation_refinement)
     if cfg.OCR:
+        d_text = dd.TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)
+        t_text = dd.TextExtractionService(tex_text,skip_if_text_extracted=True)
+        pipe_component_list.append(t_text)
+        match_words = dd.MatchingService(
             parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
+            child_categories=cfg.WORD_MATCHING.CHILD_CATEGORIES,
             matching_rule=cfg.WORD_MATCHING.RULE,
+            threshold=cfg.WORD_MATCHING.THRESHOLD,
+            max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY
         )
+        pipe_component_list.append(match_words)
         order = dd.TextOrderService(
+            text_container=cfg.TEXT_ORDERING.TEXT_CONTAINER,
+            floating_text_block_names=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK,
+            text_block_names=cfg.TEXT_ORDERING.TEXT_BLOCK,
+            text_containers_to_text_block=cfg.TEXT_ORDERING.TEXT_CONTAINER_TO_TEXT_BLOCK
         )
         pipe_component_list.append(order)
     pipe = dd.DoctectionPipe(pipeline_component_list=pipe_component_list)
+    post_processor = PostProcessor("deepdoctection", **credentials_kwargs)
+    post_service = PostProcessorService(post_processor)
+    pipe_component_list.append(post_service)
+    return pipe
+def analyze_image(img, pdf, max_datapoints):
     # creating an image object and passing to the analyzer by using dataflows
+    analyzer = build_gradio_analyzer()
     if img is not None:
         image = dd.Image(file_name="input.png", location="")
         df = DataFromList(lst=[image])
         df = analyzer.analyze(dataset_dataflow=df)
     elif pdf:
+        df = analyzer.analyze(path=pdf.name, max_datapoints=max_datapoints)
     else:
         raise ValueError
     df.reset_state()
+    layout_items_str = ""
+    jsonl_out = []
+    dpts = []
+    html_list = []
+    for dp in df:
+        dpts.append(dp)
+        out = dp.as_dict()
+        jsonl_out.append(out)
+        out.pop("_image")
+        layout_items = dp.layouts
+        layout_items.sort(key=lambda x: x.reading_order)
+        layout_items_str += f"\n\n -------- PAGE NUMBER: {dp.page_number+1} ------------- \n"
+        for item in layout_items:
+            layout_items_str += f"\n {item.category_name}: {item.text}"
+        html_list.extend([table.html for table in dp.tables])
+    if html_list:
+        html = ("<br /><br /><br />").join(html_list)
+    else:
+        html = None
+    return [dp.viz(show_cells=False) for dp in dpts], layout_items_str, html, jsonl_out
 demo = gr.Blocks(css="scrollbar.css")
 with demo:
     with gr.Box():
         gr.Markdown("<h1><center>deepdoctection - A Document AI Package</center></h1>")
                     " and document layout analysis tasks using deep learning models. It does not implement models"
                     " but enables you to build pipelines using highly acknowledged libraries for object detection,"
                     " OCR and selected NLP tasks and provides an integrated frameworks for fine-tuning, evaluating"
+                    " and running models.<br />"
+                    "This pipeline consists of a stack of models powered by <strong>Detectron2"
+                    "</strong> for layout analysis and table recognition. OCR will be provided as well. You can process "
+                    "an image or even a PDF-document. Up to nine pages can be processed. <br />")
+        gr.Markdown("[https://github.com/deepdoctection/deepdoctection](https://github.com/deepdoctection/deepdoctection)")
     with gr.Box():
         gr.Markdown("<h2><center>Upload a document and choose setting</center></h2>")
         with gr.Row():
                 gr.Examples(examples=[path.join(getcwd(), "sample_3.pdf")], inputs = inputs_pdf)
         with gr.Row():
+            max_imgs = gr.Slider(1, 8, value=2, step=1, label="Number of pages in multi page PDF",
+                                 info="Will stop after 9 pages")
         with gr.Row():
             btn = gr.Button("Run model", variant="primary")
                 with gr.Box():
                     gr.Markdown("<center><strong>Contiguous text</strong></center>")
                     image_text = gr.Textbox()
             with gr.Column():
                 with gr.Box():
                     gr.Markdown("<center><strong>Layout detection</strong></center>")
+                    gallery = gr.Gallery(
+                        label="Output images", show_label=False, elem_id="gallery"
+                    ).style(grid=2)
+        with gr.Row():
+            with gr.Box():
+                gr.Markdown("<center><strong>Table</strong></center>")
+                html = gr.HTML()
+        with gr.Row():
+            with gr.Box():
+                gr.Markdown("<center><strong>JSON</strong></center>")
+                json = gr.JSON()
+    btn.click(fn=analyze_image, inputs=[inputs, inputs_pdf,  max_imgs],
+              outputs=[gallery, image_text, html, json])
+demo.launch()

conf_dd_one.yaml CHANGED Viewed

@@ -1,26 +1,65 @@
 CONFIG:
-  D2LAYOUT: dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml
-  D2CELL: dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml
-  D2ITEM: dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml
 WEIGHTS:
-  D2LAYOUT: layout/model_final_inf_only.pt
-  D2CELL: cell/d2_model_1849999_cell_inf_only.pt
-  D2ITEM: item/d2_model_1639999_item_inf_only.pt
 SEGMENTATION:
   ASSIGNMENT_RULE: ioa
-  IOU_THRESHOLD_ROWS: 0.01
-  IOU_THRESHOLD_COLS: 0.001
-  IOA_THRESHOLD_ROWS: 0.4
-  IOA_THRESHOLD_COLS: 0.4
   FULL_TABLE_TILING: True
-  REMOVE_IOU_THRESHOLD_ROWS: 0.001
-  REMOVE_IOU_THRESHOLD_COLS: 0.001
 WORD_MATCHING:
   PARENTAL_CATEGORIES:
-    - TEXT
-    - TITLE
-    - CELL
-    - LIST
   RULE: ioa
-  IOU_THRESHOLD: 0.001
-  IOA_THRESHOLD: 0.6

 CONFIG:
+  D2LAYOUT: xrf_dd/layout/CASCADE_RCNN_R_50_FPN_GN.yaml
+  D2CELL: xrf_dd/cell/CASCADE_RCNN_R_50_FPN_GN.yaml
+  D2ITEM: xrf_dd/item/CASCADE_RCNN_R_50_FPN_GN.yaml
 WEIGHTS:
+  D2LAYOUT: xrf_layout/model_final_inf_only.pt
+  D2CELL: xrf_cell/model_final_inf_only.pt
+  D2ITEM: xrf_item/model_final_inf_only.pt
+LAYOUT_NMS_PAIRS:
+  COMBINATIONS:
+    - - text
+      - table
+    - - title
+      - table
+    - - text
+      - list
+    - - title
+      - list
+    - - text
+      - title
+    - - list
+      - table
+  THRESHOLDS:
+    - 0.005
+    - 0.005
+    - 0.542
+    - 0.1
+    - 0.699
+    - 0.01
 SEGMENTATION:
   ASSIGNMENT_RULE: ioa
+  THRESHOLD_ROWS: 0.9
+  THRESHOLD_COLS: 0.9
   FULL_TABLE_TILING: True
+  REMOVE_IOU_THRESHOLD_ROWS: 0.5
+  REMOVE_IOU_THRESHOLD_COLS: 0.5
+  STRETCH_RULE: equal
+  USE_REFINEMENT: False
 WORD_MATCHING:
   PARENTAL_CATEGORIES:
+    - text
+    - title
+    - list
+    - figure
+    - cell
+  CHILD_CATEGORIES:
+    - word
   RULE: ioa
+  THRESHOLD: 0.4
+  MAX_PARENT_ONLY: True
+TEXT_ORDERING:
+  TEXT_CONTAINER: word
+  FLOATING_TEXT_BLOCK:
+    - title
+    - text
+    - list
+    - figure
+  TEXT_BLOCK:
+    - title
+    - text
+    - list
+    - cell
+    - figure
+  TEXT_CONTAINER_TO_TEXT_BLOCK: True
+DEVICE: cpu