Gabriel committed on
Commit
5ebeb73
0 Parent(s):

bad merge quick fix..

Browse files
Files changed (42) hide show
  1. .dockerignore +13 -0
  2. .gitattributes +35 -0
  3. .github/workflows/sync_to_hub.yml +15 -0
  4. .gitignore +23 -0
  5. Dockerfile +36 -0
  6. Makefile +19 -0
  7. README.md +10 -0
  8. app.py +522 -0
  9. helper/__init__.py +0 -0
  10. helper/examples/__init__.py +0 -0
  11. helper/examples/examples.py +20 -0
  12. helper/examples/images/.gitkeep +0 -0
  13. helper/gradio_config.py +134 -0
  14. helper/text/__init__.py +0 -0
  15. helper/text/text_about.py +72 -0
  16. helper/text/text_app.py +8 -0
  17. helper/text/text_howto.py +94 -0
  18. helper/text/text_riksarkivet.py +10 -0
  19. helper/text/text_roadmap.py +17 -0
  20. models/RmtDet_lines/rtmdet_m_textlines_2_concat.py +580 -0
  21. models/RmtDet_regions/rtmdet_m_textregions_2_concat.py +380 -0
  22. models/SATRN/_base_satrn_shallow_concat.py +318 -0
  23. models/SATRN/dict1700.txt +148 -0
  24. pyproject.toml +80 -0
  25. requirements.txt +18 -0
  26. src/htr_pipeline/__init__.py +0 -0
  27. src/htr_pipeline/gradio_backend.py +143 -0
  28. src/htr_pipeline/inferencer.py +159 -0
  29. src/htr_pipeline/models.py +59 -0
  30. src/htr_pipeline/pipeline.py +70 -0
  31. src/htr_pipeline/utils/__init__.py +0 -0
  32. src/htr_pipeline/utils/filter_segmask.py +127 -0
  33. src/htr_pipeline/utils/helper.py +99 -0
  34. src/htr_pipeline/utils/order_of_object.py +88 -0
  35. src/htr_pipeline/utils/parser_xml.py +76 -0
  36. src/htr_pipeline/utils/preprocess_img.py +19 -0
  37. src/htr_pipeline/utils/process_segmask.py +87 -0
  38. src/htr_pipeline/utils/process_xml.py +150 -0
  39. src/htr_pipeline/utils/templates/arial.ttf +0 -0
  40. src/htr_pipeline/utils/templates/page_xml_2013.xml +30 -0
  41. src/tests/.gitkeep +0 -0
  42. test_api.ipynb +479 -0
.dockerignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .github/
2
+ __pycache__
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ env
8
+ .env
9
+ Makefile
10
+ page_txt.txt
11
+ page_xml.xml
12
+ helper/text/images/
13
+ helper/text/videos/
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ helper/text/videos/eating_spaghetti.mp4 filter=lfs diff=lfs merge=lfs -text
.github/workflows/sync_to_hub.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on: workflow_dispatch
3
+
4
+ jobs:
5
+ sync-to-hub:
6
+ runs-on: ubuntu-latest
7
+ steps:
8
+ - uses: actions/checkout@v3
9
+ with:
10
+ fetch-depth: 0
11
+ lfs: true
12
+ - name: Push to hub
13
+ env:
14
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
15
+ run: git push --force https://Riksarkivet:$HF_TOKEN@huggingface.co/spaces/Riksarkivet/HTR_pipeline main
.gitignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ .vscode/
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ */__pycache__
6
+ __pycache__/
7
+ *.py[cod]
8
+
9
+ vis_data/
10
+ notebooks/
11
+ output/
12
+ my_xml_filename.xml
13
+ models/RmtDet_regions/epoch_12.pth
14
+ models/RmtDet_lines/epoch_12.pth
15
+ models/SATRN/epoch_5.pth
16
+ helper/examples/images/*.jpg
17
+ flagged_data_points/
18
+ src/htr_pipeline.egg-info/
19
+
20
+ #
21
+ page_xml.xml
22
+ page_txt.txt
23
+ transcribed_text.txt
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
+
9
+ RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
10
+ #RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx \
11
+
12
+ # mim openmmlabs installs
13
+ RUN mim install mmengine
14
+ RUN mim install mmcv
15
+ RUN mim install mmdet
16
+ RUN mim install mmocr
17
+
18
+ # Set up a new user named "user" with user ID 1000
19
+ RUN useradd -m -u 1000 user
20
+ # Switch to the "user" user
21
+ USER user
22
+
23
+ # for localrun
24
+ ENV AM_I_IN_A_DOCKER_CONTAINER Yes
25
+
26
+ # Set home to the user's home directory
27
+ ENV HOME=/home/user \
28
+ PATH=/home/user/.local/bin:$PATH
29
+
30
+ # Set the working directory to the user's home directory
31
+ WORKDIR $HOME/app
32
+
33
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
34
+ COPY --chown=user . $HOME/app
35
+
36
+ CMD ["python", "app.py"]
Makefile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ venv:
2
+ python -m venv venv
3
+
4
+
5
+ activate:
6
+ source ./venv/bin/activate
7
+
8
+ build:
9
+ pip install -e .
10
+ gradio app.py
11
+
12
+ # clean_for_actions:
13
+ # git lfs prune
14
+ # git filter-branch --force --index-filter "git rm --cached --ignore-unmatch helper/text/videos/eating_spaghetti.mp4" --prune-empty --tag-name-filter cat -- --all
15
+ # git push --force origin main
16
+
17
+ # add_space:
18
+ # git remote add demo https://huggingface.co/spaces/Riksarkivet/htr_demo
19
+ # git push --force demo main
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: HTR Pipeline
3
+ emoji: 🏢
4
+ colorFrom: purple
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ models: []
9
+ datasets: []
10
+ ---
app.py ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from helper.examples.examples import ExamplesImages
4
+ from helper.gradio_config import css, js, theme
5
+ from helper.text.text_about import TextAbout
6
+ from helper.text.text_app import TextApp
7
+ from helper.text.text_howto import TextHowTo
8
+ from helper.text.text_riksarkivet import TextRiksarkivet
9
+ from helper.text.text_roadmap import TextRoadmap
10
+ from htr_pipeline.gradio_backend import CustomTrack, FastTrack, SingletonModelLoader
11
+
12
+ model_loader = SingletonModelLoader()
13
+ fast_track = FastTrack(model_loader)
14
+ custom_track = CustomTrack(model_loader)
15
+
16
+ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
17
+ gr.Markdown(" ")
18
+ gr.Markdown(TextApp.title_markdown)
19
+
20
+ with gr.Tabs():
21
+ with gr.Tab("HTR Tool"):
22
+ with gr.Row():
23
+ with gr.Column(scale=2):
24
+ with gr.Row():
25
+ fast_track_input_region_image = gr.Image(
26
+ label="Image to run HTR on", type="numpy", tool="editor", elem_id="image_upload"
27
+ ).style(height=395)
28
+
29
+ with gr.Row():
30
+ # with gr.Group():
31
+ # callback = gr.CSVLogger()
32
+ # # hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "htr_pipelin_flags")
33
+ # flagging_button = gr.Button(
34
+ # "Flag",
35
+ # variant="secondary",
36
+ # visible=True,
37
+ # ).style(full_width=True)
38
+ radio_file_input = gr.Radio(
39
+ value="Text file", choices=["Text file", "Page XML"], label="What kind file output?"
40
+ )
41
+
42
+ htr_pipeline_button = gr.Button(
43
+ "Run HTR",
44
+ variant="primary",
45
+ visible=True,
46
+ elem_id="run_pipeline_button",
47
+ ).style(full_width=False)
48
+
49
+ with gr.Group():
50
+ with gr.Row():
51
+ fast_file_downlod = gr.File(label="Download output file", visible=False)
52
+ with gr.Row():
53
+ with gr.Accordion("Example images to use:", open=False) as fast_example_accord:
54
+ fast_name_files_placeholder = gr.Markdown(visible=False)
55
+
56
+ gr.Examples(
57
+ examples=ExamplesImages.example_images_with_info,
58
+ inputs=[fast_track_input_region_image, fast_name_files_placeholder],
59
+ label="Example images",
60
+ examples_per_page=3,
61
+ )
62
+
63
+ with gr.Column(scale=4):
64
+ with gr.Row():
65
+ fast_track_output_image = gr.Image(
66
+ label="HTR results visualizer",
67
+ type="numpy",
68
+ tool="editor",
69
+ ).style(height=650)
70
+
71
+ with gr.Row(visible=False) as api_placeholder:
72
+ htr_pipeline_button_api = gr.Button(
73
+ "Run pipeline",
74
+ variant="primary",
75
+ visible=False,
76
+ ).style(full_width=False)
77
+
78
+ xml_rendered_placeholder_for_api = gr.Textbox(visible=False)
79
+
80
+ with gr.Tab("Stepwise HTR Tool"):
81
+ with gr.Tabs():
82
+ with gr.Tab("1. Region Segmentation"):
83
+ with gr.Row():
84
+ with gr.Column(scale=2):
85
+ name_files_placeholder = gr.Markdown(visible=False)
86
+
87
+ with gr.Row():
88
+ input_region_image = gr.Image(
89
+ label="Image to Region segment",
90
+ # type="numpy",
91
+ tool="editor",
92
+ ).style(height=350)
93
+
94
+ with gr.Accordion("Region segment settings:", open=False):
95
+ with gr.Row():
96
+ reg_pred_score_threshold_slider = gr.Slider(
97
+ minimum=0.4,
98
+ maximum=1,
99
+ value=0.5,
100
+ step=0.05,
101
+ label="P-threshold",
102
+ info="""Filter and determine the confidence score
103
+ required for a prediction score to be considered""",
104
+ )
105
+ reg_containments_threshold_slider = gr.Slider(
106
+ minimum=0,
107
+ maximum=1,
108
+ value=0.5,
109
+ step=0.05,
110
+ label="C-threshold",
111
+ info="""The minimum required overlap or similarity
112
+ for a detected region or object to be considered valid""",
113
+ )
114
+
115
+ with gr.Row():
116
+ region_segment_model_dropdown = gr.Dropdown(
117
+ choices=["Riksarkivet/RmtDet_region"],
118
+ value="Riksarkivet/RmtDet_region",
119
+ label="Region segment model",
120
+ info="Will add more models later!",
121
+ )
122
+
123
+ with gr.Row():
124
+ clear_button = gr.Button("Clear", variant="secondary", elem_id="clear_button")
125
+
126
+ region_segment_button = gr.Button(
127
+ "Segment Region",
128
+ variant="primary",
129
+ elem_id="region_segment_button",
130
+ ) # .style(full_width=False)
131
+
132
+ with gr.Row():
133
+ with gr.Accordion("Example images to use:", open=False) as example_accord:
134
+ gr.Examples(
135
+ examples=ExamplesImages.example_images_with_info,
136
+ inputs=[input_region_image, name_files_placeholder],
137
+ label="Example images",
138
+ examples_per_page=2,
139
+ )
140
+
141
+ with gr.Column(scale=3):
142
+ output_region_image = gr.Image(label="Segmented regions", type="numpy").style(height=600)
143
+
144
+ ##############################################
145
+ with gr.Tab("2. Line Segmentation"):
146
+ image_placeholder_lines = gr.Image(
147
+ label="Segmented lines",
148
+ # type="numpy",
149
+ interactive="False",
150
+ visible=True,
151
+ ).style(height=600)
152
+
153
+ with gr.Row(visible=False) as control_line_segment:
154
+ with gr.Column(scale=2):
155
+ with gr.Box():
156
+ regions_cropped_gallery = gr.Gallery(
157
+ label="Segmented regions",
158
+ show_label=False,
159
+ elem_id="gallery",
160
+ ).style(
161
+ columns=[2],
162
+ rows=[2],
163
+ # object_fit="contain",
164
+ height=300,
165
+ preview=True,
166
+ container=False,
167
+ )
168
+
169
+ input_region_from_gallery = gr.Image(
170
+ label="Region segmentation to line segment", interactive="False", visible=False
171
+ ).style(height=400)
172
+ with gr.Row():
173
+ with gr.Accordion("Line segment settings:", open=False):
174
+ with gr.Row():
175
+ line_pred_score_threshold_slider = gr.Slider(
176
+ minimum=0.3,
177
+ maximum=1,
178
+ value=0.4,
179
+ step=0.05,
180
+ label="Pred_score threshold",
181
+ info="""Filter and determine the confidence score
182
+ required for a prediction score to be considered""",
183
+ )
184
+ line_containments_threshold_slider = gr.Slider(
185
+ minimum=0,
186
+ maximum=1,
187
+ value=0.5,
188
+ step=0.05,
189
+ label="Containments threshold",
190
+ info="""The minimum required overlap or similarity
191
+ for a detected region or object to be considered valid""",
192
+ )
193
+ with gr.Row().style(equal_height=False):
194
+ line_segment_model_dropdown = gr.Dropdown(
195
+ choices=["Riksarkivet/RmtDet_lines"],
196
+ value="Riksarkivet/RmtDet_lines",
197
+ label="Line segment model",
198
+ info="Will add more models later!",
199
+ )
200
+ with gr.Row():
201
+ clear_line_segment_button = gr.Button(
202
+ " ",
203
+ variant="Secondary",
204
+ # elem_id="center_button",
205
+ ).style(full_width=True)
206
+
207
+ line_segment_button = gr.Button(
208
+ "Segment Lines",
209
+ variant="primary",
210
+ # elem_id="center_button",
211
+ ).style(full_width=True)
212
+
213
+ with gr.Column(scale=3):
214
+ # gr.Markdown("""lorem ipsum""")
215
+
216
+ output_line_from_region = gr.Image(
217
+ label="Segmented lines",
218
+ type="numpy",
219
+ interactive="False",
220
+ ).style(height=600)
221
+
222
+ ###############################################
223
+ with gr.Tab("3. Transcribe Text"):
224
+ image_placeholder_htr = gr.Image(
225
+ label="Transcribed lines",
226
+ # type="numpy",
227
+ interactive="False",
228
+ visible=True,
229
+ ).style(height=600)
230
+
231
+ with gr.Row(visible=False) as control_htr:
232
+ inputs_lines_to_transcribe = gr.Variable()
233
+
234
+ with gr.Column(scale=2):
235
+ image_inputs_lines_to_transcribe = gr.Image(
236
+ label="Transcribed lines",
237
+ type="numpy",
238
+ interactive="False",
239
+ visible=False,
240
+ ).style(height=470)
241
+
242
+ with gr.Row():
243
+ with gr.Accordion("Transcribe settings:", open=False):
244
+ transcriber_model = gr.Dropdown(
245
+ choices=["Riksarkivet/SATRN_transcriber", "microsoft/trocr-base-handwritten"],
246
+ value="Riksarkivet/SATRN_transcriber",
247
+ label="Transcriber model",
248
+ info="Will add more models later!",
249
+ )
250
+ with gr.Row():
251
+ clear_transcribe_button = gr.Button(" ", variant="Secondary", visible=True).style(
252
+ full_width=True
253
+ )
254
+ transcribe_button = gr.Button(
255
+ "Transcribe lines", variant="primary", visible=True
256
+ ).style(full_width=True)
257
+
258
+ donwload_txt_button = gr.Button(
259
+ "Download text", variant="secondary", visible=False
260
+ ).style(full_width=True)
261
+
262
+ with gr.Row():
263
+ txt_file_downlod = gr.File(label="Download text", visible=False)
264
+
265
+ with gr.Column(scale=3):
266
+ with gr.Row():
267
+ transcribed_text_df = gr.Dataframe(
268
+ headers=["Transcribed text"],
269
+ max_rows=15,
270
+ col_count=(1, "fixed"),
271
+ wrap=True,
272
+ interactive=False,
273
+ overflow_row_behaviour="paginate",
274
+ ).style(height=600)
275
+
276
+ #####################################
277
+ with gr.Tab("4. Explore Results"):
278
+ image_placeholder_explore_results = gr.Image(
279
+ label="Cropped transcribed lines",
280
+ # type="numpy",
281
+ interactive="False",
282
+ visible=True,
283
+ ).style(height=600)
284
+
285
+ with gr.Row(visible=False) as control_results_transcribe:
286
+ with gr.Column(scale=1, visible=True):
287
+ with gr.Box():
288
+ temp_gallery_input = gr.Variable()
289
+
290
+ gallery_inputs_lines_to_transcribe = gr.Gallery(
291
+ label="Cropped transcribed lines",
292
+ show_label=True,
293
+ elem_id="gallery_lines",
294
+ ).style(
295
+ columns=[3],
296
+ rows=[3],
297
+ # object_fit="contain",
298
+ # height="600",
299
+ preview=True,
300
+ container=False,
301
+ )
302
+ with gr.Column(scale=1, visible=True):
303
+ mapping_dict = gr.Variable()
304
+ transcribed_text_df_finish = gr.Dataframe(
305
+ headers=["Transcribed text", "HTR prediction score"],
306
+ max_rows=15,
307
+ col_count=(2, "fixed"),
308
+ wrap=True,
309
+ interactive=False,
310
+ overflow_row_behaviour="paginate",
311
+ ).style(height=600)
312
+
313
+ with gr.Tab("How to use"):
314
+ with gr.Tabs():
315
+ with gr.Tab("HTR Tool"):
316
+ with gr.Row().style(equal_height=False):
317
+ with gr.Column():
318
+ gr.Markdown(TextHowTo.htr_tool)
319
+ with gr.Column():
320
+ gr.Markdown(TextHowTo.both_htr_tool_video)
321
+ gr.Video(
322
+ value="https://github.com/Borg93/htr_gradio_file_placeholder/raw/main/eating_spaghetti.mp4",
323
+ label="How to use HTR Tool",
324
+ )
325
+ gr.Markdown(TextHowTo.reach_out)
326
+
327
+ with gr.Tab("Stepwise HTR Tool"):
328
+ with gr.Row().style(equal_height=False):
329
+ with gr.Column():
330
+ gr.Markdown(TextHowTo.stepwise_htr_tool)
331
+ with gr.Row():
332
+ with gr.Accordion("The tabs for the Stepwise HTR Tool:", open=False):
333
+ with gr.Tabs():
334
+ with gr.Tab("1. Region Segmentation"):
335
+ gr.Markdown(TextHowTo.stepwise_htr_tool_tab1)
336
+ with gr.Tab("2. Line Segmentation"):
337
+ gr.Markdown(TextHowTo.stepwise_htr_tool_tab2)
338
+ with gr.Tab("3. Transcribe Text"):
339
+ gr.Markdown(TextHowTo.stepwise_htr_tool_tab3)
340
+ with gr.Tab("4. Explore Results"):
341
+ gr.Markdown(TextHowTo.stepwise_htr_tool_tab4)
342
+ gr.Markdown(TextHowTo.stepwise_htr_tool_end)
343
+ with gr.Column():
344
+ gr.Markdown(TextHowTo.both_htr_tool_video)
345
+ gr.Video(
346
+ value="https://github.com/Borg93/htr_gradio_file_placeholder/raw/main/eating_spaghetti.mp4",
347
+ label="How to use Stepwise HTR Tool",
348
+ )
349
+ gr.Markdown(TextHowTo.reach_out)
350
+
351
+ with gr.Tab("About"):
352
+ with gr.Tabs():
353
+ with gr.Tab("Project"):
354
+ with gr.Row():
355
+ with gr.Column():
356
+ gr.Markdown(TextAbout.intro_and_pipeline_overview_text)
357
+ with gr.Row():
358
+ with gr.Tabs():
359
+ with gr.Tab("I. Binarization"):
360
+ gr.Markdown(TextAbout.binarization)
361
+ with gr.Tab("II. Region Segmentation"):
362
+ gr.Markdown(TextAbout.text_region_segment)
363
+ with gr.Tab("III. Line Segmentation"):
364
+ gr.Markdown(TextAbout.text_line_segmentation)
365
+ with gr.Tab("IV. Transcriber"):
366
+ gr.Markdown(TextAbout.text_htr)
367
+ with gr.Row():
368
+ gr.Markdown(TextAbout.text_data)
369
+
370
+ with gr.Column():
371
+ gr.Markdown(TextAbout.filler_text_data)
372
+ gr.Markdown(TextAbout.text_models)
373
+ with gr.Row():
374
+ with gr.Tabs():
375
+ with gr.Tab("Region Segmentation"):
376
+ gr.Markdown(TextAbout.text_models_region)
377
+ with gr.Tab("Line Segmentation"):
378
+ gr.Markdown(TextAbout.text_line_segmentation)
379
+ with gr.Tab("Transcriber"):
380
+ gr.Markdown(TextAbout.text_models_htr)
381
+
382
+ with gr.Tab("Roadmap"):
383
+ with gr.Row():
384
+ with gr.Column():
385
+ gr.Markdown(TextRoadmap.roadmap)
386
+ with gr.Column():
387
+ gr.Markdown(TextRoadmap.notebook)
388
+
389
+ with gr.Tab("Riksarkivet"):
390
+ with gr.Row():
391
+ with gr.Column():
392
+ gr.Markdown(TextRiksarkivet.riksarkivet)
393
+ with gr.Column():
394
+ gr.Markdown(TextRiksarkivet.contact)
395
+
396
+ htr_pipeline_button.click(
397
+ fast_track.segment_to_xml,
398
+ inputs=[fast_track_input_region_image, radio_file_input],
399
+ outputs=[fast_track_output_image, fast_file_downlod, fast_file_downlod],
400
+ )
401
+
402
+ htr_pipeline_button_api.click(
403
+ fast_track.segment_to_xml_api,
404
+ inputs=[fast_track_input_region_image],
405
+ outputs=[xml_rendered_placeholder_for_api],
406
+ api_name="predict",
407
+ )
408
+
409
+ # fast_track_input_region_image.change(
410
+ # fn=lambda: (gr.Accordion.update(open=False)),
411
+ # outputs=[fast_example_accord],
412
+ # )
413
+
414
+ # input_region_image.change(
415
+ # fn=lambda: (gr.Accordion.update(open=False)),
416
+ # outputs=[example_accord],
417
+ # )
418
+
419
+ # callback.setup([fast_track_input_region_image], "flagged_data_points")
420
+ # flagging_button.click(lambda *args: callback.flag(args), [fast_track_input_region_image], None, preprocess=False)
421
+ # flagging_button.click(lambda: (gr.update(value="Flagged")), outputs=flagging_button)
422
+ # fast_track_input_region_image.change(lambda: (gr.update(value="Flag")), outputs=flagging_button)
423
+
424
+ # custom track
425
+ region_segment_button.click(
426
+ custom_track.region_segment,
427
+ inputs=[input_region_image, reg_pred_score_threshold_slider, reg_containments_threshold_slider],
428
+ outputs=[output_region_image, regions_cropped_gallery, image_placeholder_lines, control_line_segment],
429
+ )
430
+
431
+ regions_cropped_gallery.select(
432
+ custom_track.get_select_index_image, regions_cropped_gallery, input_region_from_gallery
433
+ )
434
+
435
+ transcribed_text_df_finish.select(
436
+ fn=custom_track.get_select_index_df,
437
+ inputs=[transcribed_text_df_finish, mapping_dict],
438
+ outputs=gallery_inputs_lines_to_transcribe,
439
+ )
440
+
441
+ line_segment_button.click(
442
+ custom_track.line_segment,
443
+ inputs=[input_region_from_gallery, line_pred_score_threshold_slider, line_containments_threshold_slider],
444
+ outputs=[
445
+ output_line_from_region,
446
+ image_inputs_lines_to_transcribe,
447
+ inputs_lines_to_transcribe,
448
+ gallery_inputs_lines_to_transcribe,
449
+ temp_gallery_input,
450
+ # Hide
451
+ transcribe_button,
452
+ image_inputs_lines_to_transcribe,
453
+ image_placeholder_htr,
454
+ control_htr,
455
+ ],
456
+ )
457
+
458
+ transcribe_button.click(
459
+ custom_track.transcribe_text,
460
+ inputs=[transcribed_text_df, inputs_lines_to_transcribe],
461
+ outputs=[
462
+ transcribed_text_df,
463
+ transcribed_text_df_finish,
464
+ mapping_dict,
465
+ txt_file_downlod,
466
+ control_results_transcribe,
467
+ image_placeholder_explore_results,
468
+ ],
469
+ )
470
+
471
+ donwload_txt_button.click(
472
+ custom_track.download_df_to_txt,
473
+ inputs=transcribed_text_df,
474
+ outputs=[txt_file_downlod, txt_file_downlod],
475
+ )
476
+
477
+ clear_button.click(
478
+ lambda: (
479
+ None,
480
+ None,
481
+ None,
482
+ gr.update(visible=False),
483
+ None,
484
+ None,
485
+ None,
486
+ gr.update(visible=False),
487
+ gr.update(visible=False),
488
+ gr.update(visible=True),
489
+ None,
490
+ gr.update(visible=False),
491
+ gr.update(visible=False),
492
+ gr.update(visible=True),
493
+ gr.update(visible=True),
494
+ ),
495
+ inputs=[],
496
+ outputs=[
497
+ input_region_image,
498
+ regions_cropped_gallery,
499
+ input_region_from_gallery,
500
+ control_line_segment,
501
+ output_line_from_region,
502
+ inputs_lines_to_transcribe,
503
+ transcribed_text_df,
504
+ control_htr,
505
+ inputs_lines_to_transcribe,
506
+ image_placeholder_htr,
507
+ output_region_image,
508
+ image_inputs_lines_to_transcribe,
509
+ control_results_transcribe,
510
+ image_placeholder_explore_results,
511
+ image_placeholder_lines,
512
+ ],
513
+ )
514
+
515
+ demo.load(None, None, None, _js=js)
516
+
517
+
518
+ demo.queue(concurrency_count=5, max_size=20)
519
+
520
+
521
+ if __name__ == "__main__":
522
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
helper/__init__.py ADDED
File without changes
helper/examples/__init__.py ADDED
File without changes
helper/examples/examples.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class ExamplesImages:
2
+ image_path = "./helper/examples/images"
3
+ example_images_with_info = [
4
+ [f"{image_path}/1664-Handelskollegiet_A1_0014full.jpg", "1664 HandelsKollegiet"],
5
+ [
6
+ f"{image_path}/1735-Södra_förstadens_kämnärsrätt_00042-scan_2020-10-13_14-03-37.jpg",
7
+ "1735 Södra förstadens kämnärsrätt",
8
+ ],
9
+ [f"{image_path}/1777-Hall-_och_Manufakturrätten_HallMan_Sida_03.jpg", "1777 Hall och Manufakturrätten"],
10
+ [f"{image_path}/1840-1890_H0000304_00034.jpg", "1840-1890 --"],
11
+ [f"{image_path}/1861_R0000277_00153.jpg", "1861 --"],
12
+ [f"{image_path}/1664-Handelskollegiet_A1_0014full.jpg", "1664 HandelsKollegiet"],
13
+ [
14
+ f"{image_path}/1735-Södra_förstadens_kämnärsrätt_00042-scan_2020-10-13_14-03-37.jpg",
15
+ "1735 Södra förstadens kämnärsrätt",
16
+ ],
17
+ [f"{image_path}/1777-Hall-_och_Manufakturrätten_HallMan_Sida_03.jpg", "1777 Hall och Manufakturrätten"],
18
+ [f"{image_path}/1840-1890_H0000304_00034.jpg", "1840-1890 --"],
19
+ [f"{image_path}/1861_R0000277_00153.jpg", "1861 --"],
20
+ ]
helper/examples/images/.gitkeep ADDED
File without changes
helper/gradio_config.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+
4
+ class GradioConfig:
5
+ def __init__(self, tooltip_dict):
6
+ self.tooltip_dict = tooltip_dict
7
+ self.theme = gr.themes.Base(
8
+ primary_hue="blue",
9
+ secondary_hue="blue",
10
+ neutral_hue="slate",
11
+ font=[
12
+ gr.themes.GoogleFont("Open Sans"),
13
+ "ui-sans-serif",
14
+ "system-ui",
15
+ "sans-serif",
16
+ ],
17
+ )
18
+ self.css = """
19
+ footer {display: none !important;}
20
+ #image_upload {min-height:450}
21
+ #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 450px}
22
+ #gallery {height: 400px}
23
+ .fixed-height.svelte-g4rw9.svelte-g4rw9 {min-height: 400px;}
24
+ """
25
+
26
+ def generate_tooltip_css(self):
27
+ temp_css_list = [self.css]
28
+ for button_id, tooltip_text in self.tooltip_dict.items():
29
+ temp_css_list.append(self.template_tooltip_css(button_id, tooltip_text))
30
+
31
+ return "\n".join(temp_css_list)
32
+
33
+ def template_tooltip_css(self, button_id, tooltip_text):
34
+ return f"""
35
+ /* For tooltip */
36
+ #{button_id} {{
37
+ position: relative;
38
+ }}
39
+
40
+ #{button_id}::before {{
41
+ visibility: hidden;
42
+ content: '';
43
+ position: absolute;
44
+ bottom: 100%; /* Position on top of the parent element */
45
+ left: 50%;
46
+ margin-left: 5px; /* Adjust for the desired space between the button and tooltip */
47
+ transform: translateY(-50%);
48
+ border-width: 7px;
49
+ border-style: solid;
50
+ border-color: rgba(51, 51, 51, 0) transparent transparent rgba(51, 51, 51, 0);
51
+ transition: opacity 0.4s ease-in-out, border-color 0.4s ease-in-out;
52
+ opacity: 0;
53
+ z-index: 999;
54
+ }}
55
+
56
+ #{button_id}::after {{
57
+ visibility: hidden;
58
+ content: '{tooltip_text}';
59
+ position: absolute;
60
+ bottom: 100%; /* Position on top of the parent element */
61
+ left: 42%;
62
+ background-color: rgba(51, 51, 51, 0);
63
+ color: white;
64
+ padding: 5px;
65
+ border-radius: 3px;
66
+ z-index: 998;
67
+ opacity: 0;
68
+ transition: opacity 0.4s ease-in-out, background-color 0.4s ease-in-out;
69
+ margin-bottom: 20px !important; /* Increased from 18px to 23px to move tooltip 5px upwards */
70
+ margin-left: 0px; /* Adjust for the arrow width and the desired space between the arrow and tooltip */
71
+ white-space: normal; /* Allows the text to wrap */
72
+ width: 200px; /* Maximum line length before wrapping */
73
+ box-sizing: border-box;
74
+ }}
75
+
76
+ #{button_id}.showTooltip::before {{
77
+ visibility: visible;
78
+ opacity: 1;
79
+ border-color: rgba(51, 51, 51, 0.7) transparent transparent rgba(51, 51, 51, 0.7);
80
+ }}
81
+
82
+ #{button_id}.showTooltip::after {{
83
+ visibility: visible;
84
+ opacity: 1;
85
+ background-color: rgba(51, 51, 51, 0.7);
86
+ }}
87
+ """
88
+
89
+ def add_interaction_to_buttons(self):
90
+ button_ids_list = ", ".join([f"'#{id}'" for id, _ in self.tooltip_dict.items()])
91
+ button_ids = button_ids_list.replace("'", "")
92
+ return f"""
93
+ function monitorButtonHover() {{
94
+
95
+ gradioURL = window.location.href
96
+ if (!gradioURL.endsWith('?__theme=dark')) {{
97
+ window.location.replace(gradioURL + '?__theme=dark');
98
+ }}
99
+
100
+ const buttons = document.querySelectorAll('{button_ids}');
101
+ buttons.forEach(function(button) {{
102
+ button.addEventListener('mouseenter', function() {{
103
+ this.classList.add('showTooltip');
104
+ }});
105
+
106
+ button.addEventListener('mouseleave', function() {{
107
+ this.classList.remove('showTooltip');
108
+ }});
109
+ }})
110
+ }}
111
+ """
112
+
113
+
114
+ buttons_with_tooltip = {
115
+ "run_pipeline_button": "Runs HTR on the image. Takes approx 1-2 mins per image (depending on hardware).",
116
+ "clear_button": "Clears all states and resets the entire workflow in the stepwise tool.",
117
+ "region_segment_button": "Segments text regions in the chosen image with the chosen settings.",
118
+ "line_segment_button": "Segments chosen regions from the image gallery into lines segments.",
119
+ "transcribe_button": "Transcribes each line segment into text and streams back the data.",
120
+ }
121
+ gradio_config = GradioConfig(buttons_with_tooltip)
122
+
123
+ theme = gradio_config.theme
124
+ css = gradio_config.generate_tooltip_css()
125
+ js = gradio_config.add_interaction_to_buttons()
126
+
127
+
128
+ if __name__ == "__main__":
129
+ tooltip = GradioConfig({"run_pipeline_button": "this is a tooltop", "clear_button": "this is a tooltop"})
130
+ css = tooltip.generate_tooltip_css()
131
+ js = tooltip.add_interaction_to_buttons()
132
+
133
+ print(css)
134
+ print(js)
helper/text/__init__.py ADDED
File without changes
helper/text/text_about.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class TextAbout:
    """Markdown snippets rendered in the "About" section of the app.

    Fixes user-facing typos: "pipline" -> "pipeline", "avaialable" ->
    "available", "ruduce" -> "reduce".
    """

    # About text
    intro_and_pipeline_overview_text = """

    ## Introduction
    The Swedish National Archives presents an end-to-end HTR-pipeline consisting of two RTMDet instance segmentation models, trained with MMDetection, one for segmenting text-regions, and one for segmenting text-lines within these regions, and one SATRN HTR-model trained with MMOCR. The aim is for a generic pipeline for running-text documents ranging from 1600 to 1900. We will retrain and update the models continually as more data becomes available. Feel free to try out the pipeline yourself in our interactive demo (reference).

    ## The Pipeline in Overview

    The steps in the pipeline are as follows:
    """

    binarization = """

    ### Binarization
    The reason for binarizing the images before processing them is that we want the models to generalize as well as possible.
    By training on only binarized images and by binarizing images before running them through the pipeline, we take the target domain closer to the training domain, and reduce negative effects of background variation, background noise etc., on the final results.
    The pipeline implements a simple adaptive thresholding algorithm for binarization.
    """
    text_region_segment = """
    ### Text-region segmentation
    To facilitate the text-line segmentation process, it is advantageous to segment the image into text-regions beforehand. This initial step offers several benefits, including reducing variations in line spacing, eliminating blank areas on the page, establishing a clear reading order, and distinguishing marginalia from the main text.
    The segmentation model utilized in this process predicts both bounding boxes and masks. Although the model has the capability to predict both, only the masks are utilized for the segmentation tasks of lines and regions.
    An essential post-processing step involves checking for regions that are contained within other regions. During this step, only the containing region is retained, while the contained region is discarded. This ensures that the final segmented text-regions are accurate and devoid of overlapping or redundant areas.
    """
    text_line_segmentation = """
    ### Text-line segmentation

    This is also an RTMDet model that's trained on extracting text-lines from cropped text-regions within an image.
    The same post-processing on the instance segmentation masks is done here as in the text-region segmentation step.
    """
    text_htr = """
    ### HTR

    For the text-recognition a SATRN model (reference) was trained with mmocr on approximately one million handwritten text-line images ranging from 1600 to 1900.
    It was trained on a wide variety of archival material to make it generalize as well as possible. See below for detailed evaluation results, and also some finetuning experiments.
    """

    text_data = """
    ## The Data
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

    """

    filler_text_data = """
    ## 
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """

    text_models = """
    ## The Models
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

    """

    text_models_region = """

    ### Text-Region Segmentation
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """

    text_models_lines = """
    ### Text-Line Segmentation
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

    """

    text_models_htr = """
    ### HTR
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

    """
helper/text/text_app.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
class TextApp:
    """Markdown for the app header: Riksarkivet logo plus the page titles."""

    title_markdown = """

    <img src="https://raw.githubusercontent.com/Borg93/Riksarkivet_docs/main/docs/assets/fav-removebg-preview.png" width="4%" align="right" margin-right="100" />

    <h1><center> Handwritten Text Recognition Tool </center></h1>

    <h3><center> Swedish National Archives - Riksarkivet </center></h3>"""
helper/text/text_howto.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class TextHowTo:
    """Markdown "how to" snippets shown in the app's instruction sections.

    Fixes user-facing typos: "accordin" -> "accordion", "pagniation" ->
    "pagination", "specifes" -> "specifies", "strucutred" -> "structured",
    and "Choice a segmented region ... which populated" -> "Choose a
    segmented region ... which is populated".
    """

    htr_tool = """
    ## Getting Started with the HTR Tool
    To quickly run the HTR Tool and transcribe handwritten text, follow these steps:
    1. Open the HTR Tool tab.
    2. Upload an image or choose an image from the provided Examples (under "Example images to use:" accordion).
    Note that the accordion works like a "dropdown" and that you just need to press an example to use it (also, use the pagination at the bottom to view more examples).
    3. The radio button specifies the output file extension, which can be either text or page XML.
    4. Click the "Run HTR" button to initiate the HTR process. You can refer to the screenshot below:
    <figure>
    <img src="https://raw.githubusercontent.com/Borg93/htr_gradio_file_placeholder/main/htr_run_example.png" alt="HTR_tool" style="width:65%; display: block; margin-left: auto; margin-right:auto;" >
    <figcaption style="text-align: center;"> <em> Figure - How to Run the HTR Tool </em></figcaption>
    </figure>
    The HTR Tool will transform an image of handwritten text into structured, transcribed text within approximately 1-2 minutes (depending on your hardware).
    Note that the generated page XML file is structured in such manner that it allows for an easy integration with other software, such as Transkribus.
    <br>
    """
    reach_out = """ Feel free to reach out if you have any questions or need further assistance!

    """

    stepwise_htr_tool = """
    ## Stepwise HTR Tool

    The Stepwise HTR Tool is a powerful tool for performing Handwritten Text Recognition (HTR) tasks. The Stepwise version provides you with fine-grained control over each step of the HTR process, allowing for greater customization and troubleshooting capabilities.
    With the Stepwise HTR Tool, you can break down the HTR process into distinct steps: region segmentation, line segmentation, text transcription, and result exploration.
    This tool offers a range of configuration options to tailor the HTR process to your specific needs. You can adjust settings such as P-threshold and C-threshold to fine-tune the region and line segmentation, and choose from a selection of underlying machine learning models to drive each step of the process.
    The Stepwise HTR Tool also provides a dedicated Explore Results tab, allowing you to thoroughly analyze and interact with the transcriptions. You can sort and identify both bad and good predictions, helping you gain insights and make improvements to the HTR accuracy.
    Each step is interconnected, and the output of one step serves as the input for the next step, ensuring a seamless and efficient workflow.
    <br><br>

    Follow the instructions below provided in each tab to perform the respective step of the HTR process and ensure you work through the tabs sequentially:
    """

    stepwise_htr_tool_tab1 = """
    ### Tab 1: Region Segmentation
    The Region Segmentation tab allows you to perform the initial step of segmenting the handwritten text into regions of interest. By adjusting the P-threshold and C-threshold settings, you can control the confidence score required for a prediction and the minimum overlap or similarity for a detected region to be considered valid. Additionally, you can select an underlying machine learning model for region segmentation.
    <br><br>
    To perform region segmentation, follow these steps:
    1. Open the "Region Segmentation" tab.
    2. Upload an image or choose an image from the provided Examples (under "Example images to use:" accordion).
    3. Configure the region segmentation settings:
    - Adjust the P-threshold: Filter and determine the confidence score required for a prediction score to be considered.
    - Adjust the C-threshold: Set the minimum required overlap or similarity for a detected region or object to be considered valid.
    - Select an underlying machine learning model.
    4. Click the "Run Region Segmentation" button to initiate the region segmentation process.
    """
    stepwise_htr_tool_tab2 = """

    ### Tab 2: Line Segmentation
    In the Line Segmentation tab, you can further refine the segmentation process by identifying individual lines of text.
    Similar to the Region Segmentation tab, you can adjust the P-threshold and C-threshold settings for line segmentation and choose an appropriate machine learning model.
    <br><br>
    To perform line segmentation, follow these steps:
    1. Open the "Line Segmentation" tab.
    2. Choose a segmented region from the image gallery, which is populated with the results from the previous tab.
    3. Configure the line segmentation settings:
    - Adjust the P-threshold: Filter and determine the confidence score required for a prediction score to be considered.
    - Adjust the C-threshold: Set the minimum required overlap or similarity for a detected region or object to be considered valid.
    - Select an underlying machine learning model.
    4. Click the "Run Line Segmentation" button to initiate the line segmentation process.
    """

    stepwise_htr_tool_tab3 = """
    ### Tab 3: Transcribe Text
    The Transcribe Text tab allows you to convert the segmented text into transcriptions. Here, you can select the desired machine learning model for text transcription.
    <br><br>
    To transcribe text, follow these steps:
    1. Open the "Transcribe Text" tab.
    2. The image to transcribe is predefined with the results from the previous tab.
    3. Configure the text transcription settings:
    - Select an underlying machine learning model.
    4. Click the "Run Text Transcription" button to initiate the text transcription process.
    """

    stepwise_htr_tool_tab4 = """
    ### Tab 4: Explore Results
    Once the transcription is complete, you can explore the results in the Explore Results tab. This tab provides various features for analyzing and interacting with the transcriptions, allowing you to sort and identify both bad and good predictions.
    <br><br>
    To explore the HTR results, follow these steps:
    1. Open the "Explore Results" tab.
    2. Analyze the generated results. The image gallery of cropped text line segments is bi-directional coupled through interaction with the dataframe on the left.
    3. Use the provided features, such as the prediction score to sort and interact with the image gallery, identifying both bad and good transcriptions.
    """

    stepwise_htr_tool_end = """
    As mentioned, please note that each tab in this workflow is dependent on the previous steps, where you progressively work through the process in a step-by-step manner.
    <br>
    """

    both_htr_tool_video = """
    ## &nbsp;
    Alternatively, you can watch the instructional video below, which provides a step-by-step walkthrough of the HTR Tool and some additional features.
    """
helper/text/text_riksarkivet.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
class TextRiksarkivet:
    """Markdown snippets about Riksarkivet and contact info.

    Both sections are placeholder (lorem ipsum) text awaiting real copy.
    """

    riksarkivet = """
    ## Riksarkivet
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """

    contact = """
    ## Contact us
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
helper/text/text_roadmap.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class TextRoadmap:
    """Markdown snippets for the roadmap / usage sections.

    Mostly placeholder (lorem ipsum) text awaiting real copy.
    """

    roadmap = """

    ## Roadmap
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """

    notebook = """

    ## Using the models
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

    ## Implementing the Whole Pipeline
    * add notebook as an example (for api)

    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
models/RmtDet_lines/rtmdet_m_textlines_2_concat.py ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Auto-generated MMDetection (MMEngine-style) config for an RTMDet-m
# instance-segmentation model that segments text lines. Values are data
# only — the config is exec'd by mmengine.Config.fromfile, so edits here
# change training/inference behavior directly.

default_scope = 'mmdet'
# Standard MMEngine runtime hooks; checkpoints every epoch, keep 5, track best.
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=100),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook', interval=1, max_keep_ckpts=5, save_best='auto'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='DetVisualizationHook'))
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='DetLocalVisualizer',
    vis_backends=[dict(type='LocalVisBackend')],
    name='visualizer',
    save_dir='./')
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
log_level = 'INFO'
# NOTE(review): machine-specific absolute checkpoint path; will not resolve
# outside the original training box — confirm before reuse.
load_from = '/home/erik/Riksarkivet/Projects/HTR_Pipeline/models/checkpoints/rtmdet_lines_pr_2/epoch_11.pth'
resume = True
train_cfg = dict(
    type='EpochBasedTrainLoop',
    max_epochs=12,
    val_interval=12,
    dynamic_intervals=[(10, 1)])
val_cfg = dict(type='ValLoop')
test_cfg = dict(
    type='TestLoop',
    pipeline=[
        dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
        dict(type='Resize', scale=(640, 640), keep_ratio=True),
        dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
        dict(
            type='PackDetInputs',
            meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                       'scale_factor'))
    ])
# LR schedule: linear warmup for 1000 iters, then cosine decay epochs 6-12.
param_scheduler = [
    dict(
        type='LinearLR', start_factor=1e-05, by_epoch=False, begin=0,
        end=1000),
    dict(
        type='CosineAnnealingLR',
        eta_min=1.25e-05,
        begin=6,
        end=12,
        T_max=6,
        by_epoch=True,
        convert_to_iter_based=True)
]
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.00025, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
auto_scale_lr = dict(enable=False, base_batch_size=16)
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
file_client_args = dict(backend='disk')
# Heavy train-time augmentation: mosaic + random resize/crop + HSV + mixup.
train_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
    dict(
        type='RandomResize',
        scale=(1280, 1280),
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=(640, 640),
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(
        type='CachedMixUp',
        img_scale=(640, 640),
        ratio_range=(1.0, 1.0),
        max_cached_images=20,
        pad_val=(114, 114, 114)),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
    dict(type='PackDetInputs')
]
test_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
    dict(type='Resize', scale=(640, 640), keep_ratio=True),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor'))
]
# Test-time augmentation (multi-scale + flip) wrapper config.
tta_model = dict(
    type='DetTTAModel',
    tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100))
img_scales = [(640, 640), (320, 320), (960, 960)]
tta_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
    dict(
        type='TestTimeAug',
        transforms=[[{
            'type': 'Resize',
            'scale': (640, 640),
            'keep_ratio': True
        }, {
            'type': 'Resize',
            'scale': (320, 320),
            'keep_ratio': True
        }, {
            'type': 'Resize',
            'scale': (960, 960),
            'keep_ratio': True
        }],
                    [{
                        'type': 'RandomFlip',
                        'prob': 1.0
                    }, {
                        'type': 'RandomFlip',
                        'prob': 0.0
                    }],
                    [{
                        'type': 'Pad',
                        'size': (960, 960),
                        'pad_val': {
                            'img': (114, 114, 114)
                        }
                    }],
                    [{
                        'type':
                        'PackDetInputs',
                        'meta_keys':
                        ('img_id', 'img_path', 'ori_shape', 'img_shape',
                         'scale_factor', 'flip', 'flip_direction')
                    }]])
]
# RTMDet-m instance-segmentation model (CSPNeXt backbone + PAFPN neck).
model = dict(
    type='RTMDet',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[103.53, 116.28, 123.675],
        std=[57.375, 57.12, 58.395],
        bgr_to_rgb=False,
        batch_augments=None),
    backbone=dict(
        type='CSPNeXt',
        arch='P5',
        expand_ratio=0.5,
        deepen_factor=0.67,
        widen_factor=0.75,
        channel_attention=True,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU', inplace=True)),
    neck=dict(
        type='CSPNeXtPAFPN',
        in_channels=[192, 384, 768],
        out_channels=192,
        num_csp_blocks=2,
        expand_ratio=0.5,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU', inplace=True)),
    bbox_head=dict(
        type='RTMDetInsSepBNHead',
        # NOTE(review): num_classes=80 here (COCO default) while the
        # dataset below has a single class ('text_line') and the variable
        # num_classes = 1 is defined later — confirm which value the
        # training run actually used.
        num_classes=80,
        in_channels=192,
        stacked_convs=2,
        share_conv=True,
        pred_kernel_size=1,
        feat_channels=192,
        act_cfg=dict(type='SiLU', inplace=True),
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        anchor_generator=dict(
            type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]),
        bbox_coder=dict(type='DistancePointBBoxCoder'),
        loss_cls=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=1.0),
        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
        loss_mask=dict(
            type='DiceLoss', loss_weight=2.0, eps=5e-06, reduction='mean')),
    train_cfg=dict(
        assigner=dict(type='DynamicSoftLabelAssigner', topk=13),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    test_cfg=dict(
        nms_pre=400,
        min_bbox_size=0,
        score_thr=0.4,
        nms=dict(type='nms', iou_threshold=0.6),
        max_per_img=50,
        mask_thr_binary=0.5))
# Lighter augmentation pipeline switched to for the final epochs (no
# mosaic/mixup); see PipelineSwitchHook below.
train_pipeline_stage2 = [
    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(
        type='RandomResize',
        scale=(640, 640),
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=(640, 640),
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(type='PackDetInputs')
]
train_dataloader = dict(
    batch_size=2,
    num_workers=1,
    batch_sampler=None,
    pin_memory=True,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='ConcatDataset',
        datasets=[
            dict(
                type='CocoDataset',
                metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
                data_prefix=dict(
                    img=
                    '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/'
                ),
                ann_file=
                '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_lines2.json',
                pipeline=[
                    dict(
                        type='LoadImageFromFile',
                        file_client_args=dict(backend='disk')),
                    dict(
                        type='LoadAnnotations',
                        with_bbox=True,
                        with_mask=True,
                        poly2mask=False),
                    dict(
                        type='CachedMosaic',
                        img_scale=(640, 640),
                        pad_val=114.0),
                    dict(
                        type='RandomResize',
                        scale=(1280, 1280),
                        ratio_range=(0.1, 2.0),
                        keep_ratio=True),
                    dict(
                        type='RandomCrop',
                        crop_size=(640, 640),
                        recompute_bbox=True,
                        allow_negative_crop=True),
                    dict(type='YOLOXHSVRandomAug'),
                    dict(type='RandomFlip', prob=0.5),
                    dict(
                        type='Pad',
                        size=(640, 640),
                        pad_val=dict(img=(114, 114, 114))),
                    dict(
                        type='CachedMixUp',
                        img_scale=(640, 640),
                        ratio_range=(1.0, 1.0),
                        max_cached_images=20,
                        pad_val=(114, 114, 114)),
                    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
                    dict(type='PackDetInputs')
                ])
        ]))
# NOTE(review): val/test dataloaders point at coco_regions2.json while the
# evaluators below use coco_lines2.json — looks like a copy-paste mismatch
# for a text-LINES model; confirm against the training run.
val_dataloader = dict(
    batch_size=1,
    num_workers=10,
    dataset=dict(
        pipeline=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='Resize', scale=(640, 640), keep_ratio=True),
            dict(
                type='Pad', size=(640, 640),
                pad_val=dict(img=(114, 114, 114))),
            dict(
                type='PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                           'scale_factor'))
        ],
        type='CocoDataset',
        metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
        data_prefix=dict(
            img=
            '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/'
        ),
        ann_file=
        '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions2.json',
        test_mode=True),
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False))
test_dataloader = dict(
    batch_size=1,
    num_workers=10,
    dataset=dict(
        pipeline=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='Resize', scale=(640, 640), keep_ratio=True),
            dict(
                type='Pad', size=(640, 640),
                pad_val=dict(img=(114, 114, 114))),
            dict(
                type='PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                           'scale_factor'))
        ],
        type='CocoDataset',
        metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
        data_prefix=dict(
            img=
            '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/'
        ),
        ann_file=
        '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions2.json',
        test_mode=True),
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False))
max_epochs = 12
stage2_num_epochs = 2
base_lr = 0.00025
interval = 12
val_evaluator = dict(
    proposal_nums=(100, 1, 10),
    metric=['bbox', 'segm'],
    type='CocoMetric',
    ann_file=
    '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_lines2.json'
)
test_evaluator = dict(
    proposal_nums=(100, 1, 10),
    metric=['bbox', 'segm'],
    type='CocoMetric',
    ann_file=
    '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_lines2.json'
)
# EMA weights during training; switch to the lighter stage-2 pipeline at epoch 10.
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        priority=49),
    dict(
        type='PipelineSwitchHook',
        switch_epoch=10,
        switch_pipeline=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(
                type='LoadAnnotations',
                with_bbox=True,
                with_mask=True,
                poly2mask=False),
            dict(
                type='RandomResize',
                scale=(640, 640),
                ratio_range=(0.1, 2.0),
                keep_ratio=True),
            dict(
                type='RandomCrop',
                crop_size=(640, 640),
                recompute_bbox=True,
                allow_negative_crop=True),
            dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
            dict(type='YOLOXHSVRandomAug'),
            dict(type='RandomFlip', prob=0.5),
            dict(
                type='Pad', size=(640, 640),
                pad_val=dict(img=(114, 114, 114))),
            dict(type='PackDetInputs')
        ])
]
work_dir = '/home/erik/Riksarkivet/Projects/HTR_Pipeline/models/checkpoints/rtmdet_lines_pr_2'
train_batch_size_per_gpu = 2
val_batch_size_per_gpu = 1
train_num_workers = 1
num_classes = 1
metainfo = dict(classes='text_line', palette=[(220, 20, 60)])
# Per-dataset definitions referenced by the concat/train lists below.
icdar_2019 = dict(
    type='CocoDataset',
    metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
    data_prefix=dict(
        img=
        '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/'
    ),
    ann_file=
    '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json',
    pipeline=[
        dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
        dict(
            type='LoadAnnotations',
            with_bbox=True,
            with_mask=True,
            poly2mask=False),
        dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
        dict(
            type='RandomResize',
            scale=(1280, 1280),
            ratio_range=(0.1, 2.0),
            keep_ratio=True),
        dict(
            type='RandomCrop',
            crop_size=(640, 640),
            recompute_bbox=True,
            allow_negative_crop=True),
        dict(type='YOLOXHSVRandomAug'),
        dict(type='RandomFlip', prob=0.5),
        dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
        dict(
            type='CachedMixUp',
            img_scale=(640, 640),
            ratio_range=(1.0, 1.0),
            max_cached_images=20,
            pad_val=(114, 114, 114)),
        dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
        dict(type='PackDetInputs')
    ])
icdar_2019_test = dict(
    type='CocoDataset',
    metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
    data_prefix=dict(
        img=
        '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/'
    ),
    ann_file=
    '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_lines.json',
    test_mode=True,
    pipeline=[
        dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
        dict(type='Resize', scale=(640, 640), keep_ratio=True),
        dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
        dict(
            type='PackDetInputs',
            meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                       'scale_factor'))
    ])
police_records = dict(
    type='CocoDataset',
    metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
    data_prefix=dict(
        img=
        '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/'
    ),
    ann_file=
    '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_lines2.json',
    pipeline=[
        dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
        dict(
            type='LoadAnnotations',
            with_bbox=True,
            with_mask=True,
            poly2mask=False),
        dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
        dict(
            type='RandomResize',
            scale=(1280, 1280),
            ratio_range=(0.1, 2.0),
            keep_ratio=True),
        dict(
            type='RandomCrop',
            crop_size=(640, 640),
            recompute_bbox=True,
            allow_negative_crop=True),
        dict(type='YOLOXHSVRandomAug'),
        dict(type='RandomFlip', prob=0.5),
        dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
        dict(
            type='CachedMixUp',
            img_scale=(640, 640),
            ratio_range=(1.0, 1.0),
            max_cached_images=20,
            pad_val=(114, 114, 114)),
        dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
        dict(type='PackDetInputs')
    ])
501
+ train_list = [
502
+ dict(
503
+ type='CocoDataset',
504
+ metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
505
+ data_prefix=dict(
506
+ img=
507
+ '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/'
508
+ ),
509
+ ann_file=
510
+ '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_lines2.json',
511
+ pipeline=[
512
+ dict(
513
+ type='LoadImageFromFile',
514
+ file_client_args=dict(backend='disk')),
515
+ dict(
516
+ type='LoadAnnotations',
517
+ with_bbox=True,
518
+ with_mask=True,
519
+ poly2mask=False),
520
+ dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
521
+ dict(
522
+ type='RandomResize',
523
+ scale=(1280, 1280),
524
+ ratio_range=(0.1, 2.0),
525
+ keep_ratio=True),
526
+ dict(
527
+ type='RandomCrop',
528
+ crop_size=(640, 640),
529
+ recompute_bbox=True,
530
+ allow_negative_crop=True),
531
+ dict(type='YOLOXHSVRandomAug'),
532
+ dict(type='RandomFlip', prob=0.5),
533
+ dict(
534
+ type='Pad', size=(640, 640),
535
+ pad_val=dict(img=(114, 114, 114))),
536
+ dict(
537
+ type='CachedMixUp',
538
+ img_scale=(640, 640),
539
+ ratio_range=(1.0, 1.0),
540
+ max_cached_images=20,
541
+ pad_val=(114, 114, 114)),
542
+ dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
543
+ dict(type='PackDetInputs')
544
+ ])
545
+ ]
546
+ test_list = [
547
+ dict(
548
+ type='CocoDataset',
549
+ metainfo=dict(classes='text_line', palette=[(220, 20, 60)]),
550
+ data_prefix=dict(
551
+ img=
552
+ '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/'
553
+ ),
554
+ ann_file=
555
+ '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_lines.json',
556
+ test_mode=True,
557
+ pipeline=[
558
+ dict(
559
+ type='LoadImageFromFile',
560
+ file_client_args=dict(backend='disk')),
561
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
562
+ dict(
563
+ type='Pad', size=(640, 640),
564
+ pad_val=dict(img=(114, 114, 114))),
565
+ dict(
566
+ type='PackDetInputs',
567
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
568
+ 'scale_factor'))
569
+ ])
570
+ ]
571
+ pipeline = [
572
+ dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
573
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
574
+ dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
575
+ dict(
576
+ type='PackDetInputs',
577
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
578
+ 'scale_factor'))
579
+ ]
580
+ launcher = 'pytorch'
models/RmtDet_regions/rtmdet_m_textregions_2_concat.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_scope = "mmdet"
2
+ default_hooks = dict(
3
+ timer=dict(type="IterTimerHook"),
4
+ logger=dict(type="LoggerHook", interval=100),
5
+ param_scheduler=dict(type="ParamSchedulerHook"),
6
+ checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=5, save_best="auto"),
7
+ sampler_seed=dict(type="DistSamplerSeedHook"),
8
+ visualization=dict(type="DetVisualizationHook"),
9
+ )
10
+ env_cfg = dict(cudnn_benchmark=False, mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), dist_cfg=dict(backend="nccl"))
11
+ vis_backends = [dict(type="LocalVisBackend")]
12
+ visualizer = dict(type="DetLocalVisualizer", vis_backends=[dict(type="LocalVisBackend")], name="visualizer", save_dir="./")
13
+ log_processor = dict(type="LogProcessor", window_size=50, by_epoch=True)
14
+ log_level = "INFO"
15
+ load_from = "/home/erik/Riksarkivet/Projects/HTR_Pipeline/models/checkpoints/rtmdet_regions_6/epoch_11.pth"
16
+ resume = True
17
+ train_cfg = dict(type="EpochBasedTrainLoop", max_epochs=12, val_interval=12, dynamic_intervals=[(10, 1)])
18
+ val_cfg = dict(type="ValLoop")
19
+ test_cfg = dict(
20
+ type="TestLoop",
21
+ pipeline=[
22
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
23
+ dict(type="Resize", scale=(640, 640), keep_ratio=True),
24
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
25
+ dict(type="PackDetInputs", meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor")),
26
+ ],
27
+ )
28
+ param_scheduler = [
29
+ dict(type="LinearLR", start_factor=1e-05, by_epoch=False, begin=0, end=1000),
30
+ dict(type="CosineAnnealingLR", eta_min=1.25e-05, begin=6, end=12, T_max=6, by_epoch=True, convert_to_iter_based=True),
31
+ ]
32
+ optim_wrapper = dict(
33
+ type="OptimWrapper",
34
+ optimizer=dict(type="AdamW", lr=0.00025, weight_decay=0.05),
35
+ paramwise_cfg=dict(norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True),
36
+ )
37
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
38
+ dataset_type = "CocoDataset"
39
+ data_root = "data/coco/"
40
+ file_client_args = dict(backend="disk")
41
+ train_pipeline = [
42
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
43
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
44
+ dict(type="CachedMosaic", img_scale=(640, 640), pad_val=114.0),
45
+ dict(type="RandomResize", scale=(1280, 1280), ratio_range=(0.1, 2.0), keep_ratio=True),
46
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
47
+ dict(type="YOLOXHSVRandomAug"),
48
+ dict(type="RandomFlip", prob=0.5),
49
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
50
+ dict(type="CachedMixUp", img_scale=(640, 640), ratio_range=(1.0, 1.0), max_cached_images=20, pad_val=(114, 114, 114)),
51
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
52
+ dict(type="PackDetInputs"),
53
+ ]
54
+ test_pipeline = [
55
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
56
+ dict(type="Resize", scale=(640, 640), keep_ratio=True),
57
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
58
+ dict(type="PackDetInputs", meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor")),
59
+ ]
60
+ tta_model = dict(type="DetTTAModel", tta_cfg=dict(nms=dict(type="nms", iou_threshold=0.6), max_per_img=100))
61
+ img_scales = [(640, 640), (320, 320), (960, 960)]
62
+ tta_pipeline = [
63
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
64
+ dict(
65
+ type="TestTimeAug",
66
+ transforms=[
67
+ [
68
+ {"type": "Resize", "scale": (640, 640), "keep_ratio": True},
69
+ {"type": "Resize", "scale": (320, 320), "keep_ratio": True},
70
+ {"type": "Resize", "scale": (960, 960), "keep_ratio": True},
71
+ ],
72
+ [{"type": "RandomFlip", "prob": 1.0}, {"type": "RandomFlip", "prob": 0.0}],
73
+ [{"type": "Pad", "size": (960, 960), "pad_val": {"img": (114, 114, 114)}}],
74
+ [
75
+ {
76
+ "type": "PackDetInputs",
77
+ "meta_keys": ("img_id", "img_path", "ori_shape", "img_shape", "scale_factor", "flip", "flip_direction"),
78
+ }
79
+ ],
80
+ ],
81
+ ),
82
+ ]
83
+ model = dict(
84
+ type="RTMDet",
85
+ data_preprocessor=dict(
86
+ type="DetDataPreprocessor", mean=[103.53, 116.28, 123.675], std=[57.375, 57.12, 58.395], bgr_to_rgb=False, batch_augments=None
87
+ ),
88
+ backbone=dict(
89
+ type="CSPNeXt",
90
+ arch="P5",
91
+ expand_ratio=0.5,
92
+ deepen_factor=0.67,
93
+ widen_factor=0.75,
94
+ channel_attention=True,
95
+ norm_cfg=dict(type="SyncBN"),
96
+ act_cfg=dict(type="SiLU", inplace=True),
97
+ ),
98
+ neck=dict(
99
+ type="CSPNeXtPAFPN",
100
+ in_channels=[192, 384, 768],
101
+ out_channels=192,
102
+ num_csp_blocks=2,
103
+ expand_ratio=0.5,
104
+ norm_cfg=dict(type="SyncBN"),
105
+ act_cfg=dict(type="SiLU", inplace=True),
106
+ ),
107
+ bbox_head=dict(
108
+ type="RTMDetInsSepBNHead",
109
+ num_classes=80,
110
+ in_channels=192,
111
+ stacked_convs=2,
112
+ share_conv=True,
113
+ pred_kernel_size=1,
114
+ feat_channels=192,
115
+ act_cfg=dict(type="SiLU", inplace=True),
116
+ norm_cfg=dict(type="SyncBN", requires_grad=True),
117
+ anchor_generator=dict(type="MlvlPointGenerator", offset=0, strides=[8, 16, 32]),
118
+ bbox_coder=dict(type="DistancePointBBoxCoder"),
119
+ loss_cls=dict(type="QualityFocalLoss", use_sigmoid=True, beta=2.0, loss_weight=1.0),
120
+ loss_bbox=dict(type="GIoULoss", loss_weight=2.0),
121
+ loss_mask=dict(type="DiceLoss", loss_weight=2.0, eps=5e-06, reduction="mean"),
122
+ ),
123
+ train_cfg=dict(assigner=dict(type="DynamicSoftLabelAssigner", topk=13), allowed_border=-1, pos_weight=-1, debug=False),
124
+ test_cfg=dict(nms_pre=200, min_bbox_size=0, score_thr=0.4, nms=dict(type="nms", iou_threshold=0.6), max_per_img=50, mask_thr_binary=0.5),
125
+ )
126
+ train_pipeline_stage2 = [
127
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
128
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
129
+ dict(type="RandomResize", scale=(640, 640), ratio_range=(0.1, 2.0), keep_ratio=True),
130
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
131
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
132
+ dict(type="YOLOXHSVRandomAug"),
133
+ dict(type="RandomFlip", prob=0.5),
134
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
135
+ dict(type="PackDetInputs"),
136
+ ]
137
+ train_dataloader = dict(
138
+ batch_size=2,
139
+ num_workers=1,
140
+ batch_sampler=None,
141
+ pin_memory=True,
142
+ persistent_workers=True,
143
+ sampler=dict(type="DefaultSampler", shuffle=True),
144
+ dataset=dict(
145
+ type="ConcatDataset",
146
+ datasets=[
147
+ dict(
148
+ type="CocoDataset",
149
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
150
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/"),
151
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions2.json",
152
+ pipeline=[
153
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
154
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
155
+ dict(type="CachedMosaic", img_scale=(640, 640), pad_val=114.0),
156
+ dict(type="RandomResize", scale=(1280, 1280), ratio_range=(0.1, 2.0), keep_ratio=True),
157
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
158
+ dict(type="YOLOXHSVRandomAug"),
159
+ dict(type="RandomFlip", prob=0.5),
160
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
161
+ dict(type="CachedMixUp", img_scale=(640, 640), ratio_range=(1.0, 1.0), max_cached_images=20, pad_val=(114, 114, 114)),
162
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
163
+ dict(type="PackDetInputs"),
164
+ ],
165
+ ),
166
+ dict(
167
+ type="CocoDataset",
168
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
169
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/"),
170
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json",
171
+ pipeline=[
172
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
173
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
174
+ dict(type="CachedMosaic", img_scale=(640, 640), pad_val=114.0),
175
+ dict(type="RandomResize", scale=(1280, 1280), ratio_range=(0.1, 2.0), keep_ratio=True),
176
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
177
+ dict(type="YOLOXHSVRandomAug"),
178
+ dict(type="RandomFlip", prob=0.5),
179
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
180
+ dict(type="CachedMixUp", img_scale=(640, 640), ratio_range=(1.0, 1.0), max_cached_images=20, pad_val=(114, 114, 114)),
181
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
182
+ dict(type="PackDetInputs"),
183
+ ],
184
+ ),
185
+ ],
186
+ ),
187
+ )
188
+ val_dataloader = dict(
189
+ batch_size=1,
190
+ num_workers=10,
191
+ dataset=dict(
192
+ pipeline=[
193
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
194
+ dict(type="Resize", scale=(640, 640), keep_ratio=True),
195
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
196
+ dict(type="PackDetInputs", meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor")),
197
+ ],
198
+ type="CocoDataset",
199
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
200
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/"),
201
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions2.json",
202
+ test_mode=True,
203
+ ),
204
+ persistent_workers=True,
205
+ drop_last=False,
206
+ sampler=dict(type="DefaultSampler", shuffle=False),
207
+ )
208
+ test_dataloader = dict(
209
+ batch_size=1,
210
+ num_workers=10,
211
+ dataset=dict(
212
+ pipeline=[
213
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
214
+ dict(type="Resize", scale=(640, 640), keep_ratio=True),
215
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
216
+ dict(type="PackDetInputs", meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor")),
217
+ ],
218
+ type="CocoDataset",
219
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
220
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/"),
221
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions2.json",
222
+ test_mode=True,
223
+ ),
224
+ persistent_workers=True,
225
+ drop_last=False,
226
+ sampler=dict(type="DefaultSampler", shuffle=False),
227
+ )
228
+ max_epochs = 12
229
+ stage2_num_epochs = 2
230
+ base_lr = 0.00025
231
+ interval = 12
232
+ val_evaluator = dict(
233
+ proposal_nums=(100, 1, 10),
234
+ metric=["bbox", "segm"],
235
+ type="CocoMetric",
236
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json",
237
+ )
238
+ test_evaluator = dict(
239
+ proposal_nums=(100, 1, 10),
240
+ metric=["bbox", "segm"],
241
+ type="CocoMetric",
242
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json",
243
+ )
244
+ custom_hooks = [
245
+ dict(type="EMAHook", ema_type="ExpMomentumEMA", momentum=0.0002, update_buffers=True, priority=49),
246
+ dict(
247
+ type="PipelineSwitchHook",
248
+ switch_epoch=10,
249
+ switch_pipeline=[
250
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
251
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
252
+ dict(type="RandomResize", scale=(640, 640), ratio_range=(0.1, 2.0), keep_ratio=True),
253
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
254
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
255
+ dict(type="YOLOXHSVRandomAug"),
256
+ dict(type="RandomFlip", prob=0.5),
257
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
258
+ dict(type="PackDetInputs"),
259
+ ],
260
+ ),
261
+ ]
262
+ work_dir = "/home/erik/Riksarkivet/Projects/HTR_Pipeline/models/checkpoints/rtmdet_regions_6"
263
+ train_batch_size_per_gpu = 2
264
+ val_batch_size_per_gpu = 1
265
+ train_num_workers = 1
266
+ num_classes = 1
267
+ metainfo = dict(classes="TextRegion", palette=[(220, 20, 60)])
268
+ icdar_2019 = dict(
269
+ type="CocoDataset",
270
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
271
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/"),
272
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json",
273
+ pipeline=[
274
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
275
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
276
+ dict(type="CachedMosaic", img_scale=(640, 640), pad_val=114.0),
277
+ dict(type="RandomResize", scale=(1280, 1280), ratio_range=(0.1, 2.0), keep_ratio=True),
278
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
279
+ dict(type="YOLOXHSVRandomAug"),
280
+ dict(type="RandomFlip", prob=0.5),
281
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
282
+ dict(type="CachedMixUp", img_scale=(640, 640), ratio_range=(1.0, 1.0), max_cached_images=20, pad_val=(114, 114, 114)),
283
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
284
+ dict(type="PackDetInputs"),
285
+ ],
286
+ )
287
+ icdar_2019_test = dict(
288
+ type="CocoDataset",
289
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
290
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/"),
291
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json",
292
+ test_mode=True,
293
+ pipeline=[
294
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
295
+ dict(type="Resize", scale=(640, 640), keep_ratio=True),
296
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
297
+ dict(type="PackDetInputs", meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor")),
298
+ ],
299
+ )
300
+ police_records = dict(
301
+ type="CocoDataset",
302
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
303
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/"),
304
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions2.json",
305
+ pipeline=[
306
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
307
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
308
+ dict(type="CachedMosaic", img_scale=(640, 640), pad_val=114.0),
309
+ dict(type="RandomResize", scale=(1280, 1280), ratio_range=(0.1, 2.0), keep_ratio=True),
310
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
311
+ dict(type="YOLOXHSVRandomAug"),
312
+ dict(type="RandomFlip", prob=0.5),
313
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
314
+ dict(type="CachedMixUp", img_scale=(640, 640), ratio_range=(1.0, 1.0), max_cached_images=20, pad_val=(114, 114, 114)),
315
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
316
+ dict(type="PackDetInputs"),
317
+ ],
318
+ )
319
+ train_list = [
320
+ dict(
321
+ type="CocoDataset",
322
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
323
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/"),
324
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions2.json",
325
+ pipeline=[
326
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
327
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
328
+ dict(type="CachedMosaic", img_scale=(640, 640), pad_val=114.0),
329
+ dict(type="RandomResize", scale=(1280, 1280), ratio_range=(0.1, 2.0), keep_ratio=True),
330
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
331
+ dict(type="YOLOXHSVRandomAug"),
332
+ dict(type="RandomFlip", prob=0.5),
333
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
334
+ dict(type="CachedMixUp", img_scale=(640, 640), ratio_range=(1.0, 1.0), max_cached_images=20, pad_val=(114, 114, 114)),
335
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
336
+ dict(type="PackDetInputs"),
337
+ ],
338
+ ),
339
+ dict(
340
+ type="CocoDataset",
341
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
342
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/"),
343
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json",
344
+ pipeline=[
345
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
346
+ dict(type="LoadAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
347
+ dict(type="CachedMosaic", img_scale=(640, 640), pad_val=114.0),
348
+ dict(type="RandomResize", scale=(1280, 1280), ratio_range=(0.1, 2.0), keep_ratio=True),
349
+ dict(type="RandomCrop", crop_size=(640, 640), recompute_bbox=True, allow_negative_crop=True),
350
+ dict(type="YOLOXHSVRandomAug"),
351
+ dict(type="RandomFlip", prob=0.5),
352
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
353
+ dict(type="CachedMixUp", img_scale=(640, 640), ratio_range=(1.0, 1.0), max_cached_images=20, pad_val=(114, 114, 114)),
354
+ dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1)),
355
+ dict(type="PackDetInputs"),
356
+ ],
357
+ ),
358
+ ]
359
+ test_list = [
360
+ dict(
361
+ type="CocoDataset",
362
+ metainfo=dict(classes="TextRegion", palette=[(220, 20, 60)]),
363
+ data_prefix=dict(img="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/"),
364
+ ann_file="/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/gt_files/coco_regions2.json",
365
+ test_mode=True,
366
+ pipeline=[
367
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
368
+ dict(type="Resize", scale=(640, 640), keep_ratio=True),
369
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
370
+ dict(type="PackDetInputs", meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor")),
371
+ ],
372
+ )
373
+ ]
374
+ pipeline = [
375
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
376
+ dict(type="Resize", scale=(640, 640), keep_ratio=True),
377
+ dict(type="Pad", size=(640, 640), pad_val=dict(img=(114, 114, 114))),
378
+ dict(type="PackDetInputs", meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor")),
379
+ ]
380
+ launcher = "pytorch"
models/SATRN/_base_satrn_shallow_concat.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_scope = "mmocr"
2
+ env_cfg = dict(
3
+ cudnn_benchmark=True, mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), dist_cfg=dict(backend="nccl")
4
+ )
5
+ randomness = dict(seed=None)
6
+ default_hooks = dict(
7
+ timer=dict(type="IterTimerHook"),
8
+ logger=dict(type="LoggerHook", interval=100),
9
+ param_scheduler=dict(type="ParamSchedulerHook"),
10
+ checkpoint=dict(type="CheckpointHook", interval=1),
11
+ sampler_seed=dict(type="DistSamplerSeedHook"),
12
+ sync_buffer=dict(type="SyncBuffersHook"),
13
+ visualization=dict(type="VisualizationHook", interval=1, enable=False, show=False, draw_gt=False, draw_pred=False),
14
+ )
15
+ log_level = "INFO"
16
+ log_processor = dict(type="LogProcessor", window_size=10, by_epoch=True)
17
+ load_from = (
18
+ "/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/models/checkpoints/1700_1800_combined_satrn/epoch_5.pth"
19
+ )
20
+ resume = False
21
+ val_evaluator = dict(
22
+ type="Evaluator",
23
+ metrics=[
24
+ dict(
25
+ type="WordMetric",
26
+ mode=["exact", "ignore_case", "ignore_case_symbol"],
27
+ valid_symbol="[^A-Z^a-z^0-9^一-龥^å^ä^ö^Å^Ä^Ö]",
28
+ ),
29
+ dict(type="CharMetric", valid_symbol="[^A-Z^a-z^0-9^一-龥^å^ä^ö^Å^Ä^Ö]"),
30
+ dict(type="OneMinusNEDMetric", valid_symbol="[^A-Z^a-z^0-9^一-龥^å^ä^ö^Å^Ä^Ö]"),
31
+ ],
32
+ )
33
+ test_evaluator = dict(
34
+ type="Evaluator",
35
+ metrics=[
36
+ dict(
37
+ type="WordMetric",
38
+ mode=["exact", "ignore_case", "ignore_case_symbol"],
39
+ valid_symbol="[^A-Z^a-z^0-9^一-龥^å^ä^ö^Å^Ä^Ö]",
40
+ ),
41
+ dict(type="CharMetric", valid_symbol="[^A-Z^a-z^0-9^一-龥^å^ä^ö^Å^Ä^Ö]"),
42
+ dict(type="OneMinusNEDMetric", valid_symbol="[^A-Z^a-z^0-9^一-龥^å^ä^ö^Å^Ä^Ö]"),
43
+ ],
44
+ )
45
+ vis_backends = [dict(type="LocalVisBackend")]
46
+ visualizer = dict(type="TextRecogLocalVisualizer", name="visualizer", vis_backends=[dict(type="TensorboardVisBackend")])
47
+ optim_wrapper = dict(type="OptimWrapper", optimizer=dict(type="Adam", lr=0.0003))
48
+ train_cfg = dict(type="EpochBasedTrainLoop", max_epochs=5, val_interval=1)
49
+ val_cfg = dict(type="ValLoop")
50
+ test_cfg = dict(type="TestLoop")
51
+ param_scheduler = [dict(type="MultiStepLR", milestones=[3, 4], end=5)]
52
+ file_client_args = dict(backend="disk")
53
+ dictionary = dict(
54
+ type="Dictionary",
55
+ dict_file="./models/SATRN/dict1700.txt",
56
+ with_padding=True,
57
+ with_unknown=True,
58
+ same_start_end=True,
59
+ with_start=True,
60
+ with_end=True,
61
+ )
62
+ model = dict(
63
+ type="SATRN",
64
+ backbone=dict(type="ShallowCNN", input_channels=3, hidden_dim=512),
65
+ encoder=dict(
66
+ type="SATRNEncoder",
67
+ n_layers=12,
68
+ n_head=8,
69
+ d_k=64,
70
+ d_v=64,
71
+ d_model=512,
72
+ n_position=100,
73
+ d_inner=2048,
74
+ dropout=0.1,
75
+ ),
76
+ decoder=dict(
77
+ type="NRTRDecoder",
78
+ n_layers=6,
79
+ d_embedding=512,
80
+ n_head=8,
81
+ d_model=512,
82
+ d_inner=2048,
83
+ d_k=64,
84
+ d_v=64,
85
+ module_loss=dict(type="CEModuleLoss", flatten=True, ignore_first_char=True),
86
+ dictionary=dict(
87
+ type="Dictionary",
88
+ dict_file="./models/SATRN/dict1700.txt",
89
+ with_padding=True,
90
+ with_unknown=True,
91
+ same_start_end=True,
92
+ with_start=True,
93
+ with_end=True,
94
+ ),
95
+ max_seq_len=100,
96
+ postprocessor=dict(type="AttentionPostprocessor"),
97
+ ),
98
+ data_preprocessor=dict(
99
+ type="TextRecogDataPreprocessor", mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]
100
+ ),
101
+ )
102
+ train_pipeline = [
103
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk"), ignore_empty=True, min_size=2),
104
+ dict(type="LoadOCRAnnotations", with_text=True),
105
+ dict(type="Resize", scale=(400, 64), keep_ratio=False),
106
+ dict(type="PackTextRecogInputs", meta_keys=("img_path", "ori_shape", "img_shape", "valid_ratio")),
107
+ ]
108
+ test_pipeline = [
109
+ dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
110
+ dict(type="Resize", scale=(400, 64), keep_ratio=False),
111
+ dict(type="LoadOCRAnnotations", with_text=True),
112
+ dict(type="PackTextRecogInputs", meta_keys=("img_path", "ori_shape", "img_shape", "valid_ratio")),
113
+ ]
114
+ HTR_1700_combined_train = dict(
115
+ type="RecogTextDataset",
116
+ parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
117
+ data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_clean",
118
+ ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_HTR_shuffled_train.jsonl",
119
+ test_mode=False,
120
+ pipeline=None,
121
+ )
122
+ HTR_1700_combined_test = dict(
123
+ type="RecogTextDataset",
124
+ parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
125
+ data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_clean",
126
+ ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_HTR_shuffled_val.jsonl",
127
+ test_mode=True,
128
+ pipeline=None,
129
+ )
130
# ---------------------------------------------------------------------------
# SATRN dataset / dataloader configuration (MMOCR-style config).
# NOTE(review): every path below is an absolute path on the training cluster —
# confirm they are overridden before running anywhere else.
# ---------------------------------------------------------------------------

# 1800s line images, "filename|text" pipe-separated ground truth — train split.
pr_cr_combined_train = dict(
    type="RecogTextDataset",
    parser_cfg=dict(type="LineStrParser", keys=["filename", "text"], separator="|"),
    data_root="/ceph/hpc/scratch/user/euerikl/data/line_images",
    ann_file="/ceph/hpc/home/euerikl/projects/htr_1800/gt_files/combined_train.txt",
    test_mode=False,
    pipeline=None,
)
# 1800s line images — evaluation split.
pr_cr_combined_test = dict(
    type="RecogTextDataset",
    parser_cfg=dict(type="LineStrParser", keys=["filename", "text"], separator="|"),
    data_root="/ceph/hpc/scratch/user/euerikl/data/line_images",
    ann_file="/ceph/hpc/home/euerikl/projects/htr_1800/gt_files/combined_eval.txt",
    test_mode=True,
    pipeline=None,
)
# 1700s out-of-domain test sets, JSONL ground truth.
out_of_domain_1700_all_test = dict(
    type="RecogTextDataset",
    parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
    data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_testsets_clean",
    ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_testsets_gt/1700_HTR_testsets_all.jsonl",
    test_mode=True,
    pipeline=None,
)
# Training mixes the 1700s training split with the 1800s training split.
train_list = [
    dict(
        type="RecogTextDataset",
        parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
        data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_clean",
        ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_HTR_shuffled_train.jsonl",
        test_mode=False,
        pipeline=None,
    ),
    dict(
        type="RecogTextDataset",
        parser_cfg=dict(type="LineStrParser", keys=["filename", "text"], separator="|"),
        data_root="/ceph/hpc/scratch/user/euerikl/data/line_images",
        ann_file="/ceph/hpc/home/euerikl/projects/htr_1800/gt_files/combined_train.txt",
        test_mode=False,
        pipeline=None,
    ),
]
# Evaluation uses only the combined 1700s test sets.
test_list = [
    dict(
        type="RecogTextDataset",
        parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
        data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_testsets_clean",
        ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_testsets_gt/1700_HTR_testsets_all.jsonl",
        test_mode=True,
        pipeline=None,
    )
]
# Concatenated training dataset. Line crops are resized (not aspect-preserving)
# to 400x64; empty/tiny images are skipped (ignore_empty / min_size).
train_dataset = dict(
    type="ConcatDataset",
    datasets=[
        dict(
            type="RecogTextDataset",
            parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
            data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_clean",
            ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_HTR_shuffled_train.jsonl",
            test_mode=False,
            pipeline=None,
        ),
        dict(
            type="RecogTextDataset",
            parser_cfg=dict(type="LineStrParser", keys=["filename", "text"], separator="|"),
            data_root="/ceph/hpc/scratch/user/euerikl/data/line_images",
            ann_file="/ceph/hpc/home/euerikl/projects/htr_1800/gt_files/combined_train.txt",
            test_mode=False,
            pipeline=None,
        ),
    ],
    pipeline=[
        dict(type="LoadImageFromFile", file_client_args=dict(backend="disk"), ignore_empty=True, min_size=2),
        dict(type="LoadOCRAnnotations", with_text=True),
        dict(type="Resize", scale=(400, 64), keep_ratio=False),
        dict(type="PackTextRecogInputs", meta_keys=("img_path", "ori_shape", "img_shape", "valid_ratio")),
    ],
)
# Concatenated test dataset (same resize; no ignore_empty/min_size filtering).
test_dataset = dict(
    type="ConcatDataset",
    datasets=[
        dict(
            type="RecogTextDataset",
            parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
            data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_testsets_clean",
            ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_testsets_gt/1700_HTR_testsets_all.jsonl",
            test_mode=True,
            pipeline=None,
        )
    ],
    pipeline=[
        dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
        dict(type="Resize", scale=(400, 64), keep_ratio=False),
        dict(type="LoadOCRAnnotations", with_text=True),
        dict(type="PackTextRecogInputs", meta_keys=("img_path", "ori_shape", "img_shape", "valid_ratio")),
    ],
)
# Dataloaders repeat the dataset definitions inline (configs are flattened).
train_dataloader = dict(
    batch_size=8,
    num_workers=1,
    persistent_workers=True,
    sampler=dict(type="DefaultSampler", shuffle=True),
    dataset=dict(
        type="ConcatDataset",
        datasets=[
            dict(
                type="RecogTextDataset",
                parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
                data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_clean",
                ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_HTR_shuffled_train.jsonl",
                test_mode=False,
                pipeline=None,
            ),
            dict(
                type="RecogTextDataset",
                parser_cfg=dict(type="LineStrParser", keys=["filename", "text"], separator="|"),
                data_root="/ceph/hpc/scratch/user/euerikl/data/line_images",
                ann_file="/ceph/hpc/home/euerikl/projects/htr_1800/gt_files/combined_train.txt",
                test_mode=False,
                pipeline=None,
            ),
        ],
        pipeline=[
            dict(type="LoadImageFromFile", file_client_args=dict(backend="disk"), ignore_empty=True, min_size=2),
            dict(type="LoadOCRAnnotations", with_text=True),
            dict(type="Resize", scale=(400, 64), keep_ratio=False),
            dict(type="PackTextRecogInputs", meta_keys=("img_path", "ori_shape", "img_shape", "valid_ratio")),
        ],
    ),
)
test_dataloader = dict(
    batch_size=8,
    num_workers=1,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type="DefaultSampler", shuffle=False),
    dataset=dict(
        type="ConcatDataset",
        datasets=[
            dict(
                type="RecogTextDataset",
                parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
                data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_testsets_clean",
                ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_testsets_gt/1700_HTR_testsets_all.jsonl",
                test_mode=True,
                pipeline=None,
            )
        ],
        pipeline=[
            dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
            dict(type="Resize", scale=(400, 64), keep_ratio=False),
            dict(type="LoadOCRAnnotations", with_text=True),
            dict(type="PackTextRecogInputs", meta_keys=("img_path", "ori_shape", "img_shape", "valid_ratio")),
        ],
    ),
)
# Validation reuses the test configuration verbatim.
val_dataloader = dict(
    batch_size=8,
    num_workers=1,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type="DefaultSampler", shuffle=False),
    dataset=dict(
        type="ConcatDataset",
        datasets=[
            dict(
                type="RecogTextDataset",
                parser_cfg=dict(type="LineJsonParser", keys=["filename", "text"]),
                data_root="/ceph/hpc/scratch/user/euerikl/data/HTR_1700_testsets_clean",
                ann_file="/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/data/processed/1700_testsets_gt/1700_HTR_testsets_all.jsonl",
                test_mode=True,
                pipeline=None,
            )
        ],
        pipeline=[
            dict(type="LoadImageFromFile", file_client_args=dict(backend="disk")),
            dict(type="Resize", scale=(400, 64), keep_ratio=False),
            dict(type="LoadOCRAnnotations", with_text=True),
            dict(type="PackTextRecogInputs", meta_keys=("img_path", "ori_shape", "img_shape", "valid_ratio")),
        ],
    ),
)
# Training run settings: 4 GPUs launched via torch.distributed.
gpu_ids = range(0, 4)
cudnn_benchmark = True
work_dir = "/ceph/hpc/home/euerikl/projects/hf_openmmlab_models/models/checkpoints/1700_1800_combined_satrn"
checkpoint_config = dict(interval=1)
auto_scale_lr = dict(base_batch_size=32)
launcher = "pytorch"
models/SATRN/dict1700.txt ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ !
4
+ "
5
+ #
6
+ %
7
+ &
8
+ '
9
+ (
10
+ )
11
+ *
12
+ +
13
+ ,
14
+ -
15
+ .
16
+ /
17
+ 0
18
+ 1
19
+ 2
20
+ 3
21
+ 4
22
+ 5
23
+ 6
24
+ 7
25
+ 8
26
+ 9
27
+ :
28
+ ;
29
+ <
30
+ =
31
+ ?
32
+ A
33
+ B
34
+ C
35
+ D
36
+ E
37
+ F
38
+ G
39
+ H
40
+ I
41
+ J
42
+ K
43
+ L
44
+ M
45
+ N
46
+ O
47
+ P
48
+ Q
49
+ R
50
+ S
51
+ T
52
+ U
53
+ V
54
+ W
55
+ X
56
+ Y
57
+ Z
58
+ [
59
+ \
60
+ ]
61
+ _
62
+ a
63
+ b
64
+ c
65
+ d
66
+ e
67
+ f
68
+ g
69
+ h
70
+ i
71
+ j
72
+ k
73
+ l
74
+ m
75
+ n
76
+ o
77
+ p
78
+ q
79
+ r
80
+ s
81
+ t
82
+ u
83
+ v
84
+ w
85
+ x
86
+ y
87
+ z
88
+ {
89
+ |
90
+ }
91
+ ~
92
+ £
93
+ §
94
+ ¨
95
+ ¬
96
+ ¼
97
+ ½
98
+ ¾
99
+ Ä
100
+ Å
101
+ Ö
102
+ Ü
103
+ ß
104
+ à
105
+ á
106
+ ä
107
+ å
108
+ æ
109
+ ç
110
+ è
111
+ é
112
+ ê
113
+ ë
114
+ ï
115
+ ô
116
+ ö
117
+ ü
118
+ ý
119
+ ÿ
120
+ œ
121
+ ƒ
122
+ ̄
123
+ ̅
124
+ Ψ
125
+ β
126
+ ӕ
127
+ َ
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+ 🜍
148
+ 🜔
pyproject.toml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # [build-system]
2
+ # requires = ["setuptools"]
3
+ # build-backend = "setuptools.build_meta"
4
+
5
+
6
+ # [project]
7
+ # name = "htr_pipeline"
8
+ # description = "The purpose of the project is to demo Riksarkivets HTR-pipeline"
9
+ # requires-python = ">= 3.10"
10
+ # version="0.0.0.dev1"
11
+ # authors = [{ name = "The Swedish National Archives Face team "}]
12
+ # license = { text = "MIT" }
13
+
14
+ # dependencies = [
15
+ # "torch",
16
+ # "torchvision",
17
+ # "openmim",
18
+ # "gradio",
19
+ # "pandas",
20
+ # "numpy",
21
+ # "opencv-python-headless",
22
+ # "jinja2",
23
+ # "transformers",
24
+ # "huggingface_hub",
25
+ # "requests",
26
+ # ]
27
+
28
+ # # !mim install mmengine
29
+ # # !mim install mmcv
30
+ # # !mim install mmdet
31
+ # # !mim install mmocr
32
+
33
+ [tool.ruff]
34
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
35
+ select = ["E", "F"]
36
+ ignore = []
37
+
38
+ # Allow autofix for all enabled rules (when `--fix`) is provided.
39
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
40
+ unfixable = []
41
+
42
+ # Exclude a variety of commonly ignored directories.
43
+ exclude = [
44
+ ".bzr",
45
+ ".direnv",
46
+ ".eggs",
47
+ ".git",
48
+ ".hg",
49
+ ".mypy_cache",
50
+ ".nox",
51
+ ".pants.d",
52
+ ".pytype",
53
+ ".ruff_cache",
54
+ ".svn",
55
+ ".tox",
56
+ ".venv",
57
+ "__pypackages__",
58
+ "_build",
59
+ "buck-out",
60
+ "build",
61
+ "dist",
62
+ "node_modules",
63
+ "venv",
64
+ ]
65
+
66
+ # Same as Black.
67
+ line-length = 120
68
+
69
+ # Allow unused variables when underscore-prefixed.
70
+ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
71
+
72
+ # Assume Python 3.10.
73
+ target-version = "py310"
74
+
75
+ [tool.ruff.mccabe]
76
+ # Unlike Flake8, default to a complexity level of 10.
77
+ max-complexity = 10
78
+
79
+ [tool.black]
80
+ line-length = 120
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # !pip install -U openmim
2
+ # !mim install mmengine
3
+ # !mim install mmcv
4
+ # !mim install mmdet
5
+ # !mim install mmocr
6
+ torch
7
+ torchvision
8
+ openmim
9
+ gradio
10
+ pandas
11
+ numpy
12
+ opencv-python-headless
13
+ jinja2
14
+ transformers
15
+ huggingface_hub
16
+ requests
17
+ # scipy
18
+ # sklearn
src/htr_pipeline/__init__.py ADDED
File without changes
src/htr_pipeline/gradio_backend.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+
6
+ from src.htr_pipeline.inferencer import Inferencer, InferencerInterface
7
+ from src.htr_pipeline.pipeline import Pipeline, PipelineInterface
8
+
9
+
10
class SingletonModelLoader:
    """Process-wide singleton holding the heavy HTR model objects.

    The `Inferencer`/`Pipeline` pair is expensive to build (it loads model
    checkpoints), so every call site shares one instance.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            # Bug fix: object.__new__ accepts no extra arguments — forwarding
            # *args/**kwargs raised TypeError whenever any were supplied.
            cls._instance = super(SingletonModelLoader, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # Bug fix: __init__ runs on *every* SingletonModelLoader() call even
        # though __new__ returns the cached instance; without this guard the
        # models were reloaded from disk each time, defeating the singleton.
        if getattr(self, "_initialized", False):
            return
        self.inferencer = Inferencer(local_run=True)
        self.pipeline = Pipeline(self.inferencer)
        self._initialized = True
21
+
22
+
23
# fast track
class FastTrack:
    """End-to-end ("fast track") HTR: image in, Page XML or plain text out."""

    def __init__(self, model_loader):
        self.pipeline: PipelineInterface = model_loader.pipeline

    def segment_to_xml(self, image, radio_button_choices):
        """Run the whole pipeline on `image` and write the result files.

        Returns the XML-overlay visualization, the path of the output file the
        user asked for (text or XML), and a gradio visibility update.
        """
        page_xml_file = "page_xml.xml"
        page_txt_file = "page_txt.txt"

        # Drop a stale XML from a previous run before writing the new one.
        if os.path.exists(f"./{page_xml_file}"):
            os.remove(f"./{page_xml_file}")

        rendered_xml = self.pipeline.running_htr_pipeline(image)
        with open(page_xml_file, "w") as xml_handle:
            xml_handle.write(rendered_xml)

        xml_img = self.visualize_xml_and_return_txt(image, page_txt_file)

        returned_file_extension = page_txt_file if radio_button_choices == "Text file" else page_xml_file

        return xml_img, returned_file_extension, gr.update(visible=True)

    def segment_to_xml_api(self, image):
        """API variant: run the pipeline and return the rendered Page XML string."""
        return self.pipeline.running_htr_pipeline(image)

    def visualize_xml_and_return_txt(self, img, xml_txt):
        """Visualize the generated XML over `img` and regenerate the txt export."""
        xml_img = self.pipeline.visualize_xml(img)

        # Remove any stale text export, then rebuild it from the current XML.
        if os.path.exists(f"./{xml_txt}"):
            os.remove(f"./{xml_txt}")
        self.pipeline.parse_xml_to_txt()

        return xml_img
62
+
63
+
64
# Custom track
class CustomTrack:
    """Step-by-step ("custom track") HTR: region segmentation, line
    segmentation and transcription as separate gradio interactions."""

    def __init__(self, model_loader):
        self.inferencer: InferencerInterface = model_loader.inferencer

    def region_segment(self, image, pred_score_threshold, containments_treshold):
        """Segment text regions; returns the visualization, the ordered region
        crops, and gradio visibility updates."""
        predicted_regions, regions_cropped_ordered, _, _ = self.inferencer.predict_regions(
            image, pred_score_threshold, containments_treshold
        )
        return predicted_regions, regions_cropped_ordered, gr.update(visible=False), gr.update(visible=True)

    def line_segment(self, image, pred_score_threshold, containments_threshold):
        """Segment text lines; returns the visualization, the input image, the
        ordered line crops (repeated for several UI components), and updates."""
        predicted_lines, lines_cropped_ordered, _ = self.inferencer.predict_lines(
            image, pred_score_threshold, containments_threshold
        )
        return (
            predicted_lines,
            image,
            lines_cropped_ordered,
            lines_cropped_ordered,  #
            lines_cropped_ordered,  # temp_gallery
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=True),
        )

    def transcribe_text(self, df, images):
        """Transcribe each line image, streaming partial results to the UI.

        Generator: yields (display df, full df with scores, text->image
        mapping, gradio updates) after every image so the table grows live.
        """
        transcription_temp_list_with_score = []
        mapping_dict = {}

        for image in images:
            transcribed_text, prediction_score_from_htr = self.inferencer.transcribe(image)
            transcription_temp_list_with_score.append((transcribed_text, prediction_score_from_htr))

            # Rebuilt every iteration on purpose: each yield hands the UI the
            # dataframe grown by one row.
            df_trans_explore = pd.DataFrame(
                transcription_temp_list_with_score, columns=["Transcribed text", "HTR prediction score"]
            )

            # NOTE(review): keyed on the transcription text itself — two lines
            # with identical text overwrite each other. Confirm whether keys
            # should instead be unique per line.
            mapping_dict[transcribed_text] = image

            yield df_trans_explore[["Transcribed text"]], df_trans_explore, mapping_dict, gr.update(
                visible=False
            ), gr.update(visible=True), gr.update(visible=False)

    def get_select_index_image(self, images_from_gallery, evt: gr.SelectData):
        """Return the file path of the gallery image the user clicked."""
        return images_from_gallery[evt.index]["name"]

    def get_select_index_df(self, transcribed_text_df_finish, mapping_dict, evt: gr.SelectData):
        """Reorder the gallery so the image of the selected table row is first."""
        df_list = transcribed_text_df_finish["Transcribed text"].tolist()
        key_text = df_list[evt.index[0]]
        sorted_image = mapping_dict[key_text]
        new_first = [sorted_image]
        new_list = [img for txt, img in mapping_dict.items() if txt != key_text]
        new_first.extend(new_list)
        return new_first

    def download_df_to_txt(self, transcribed_df):
        """Dump the transcriptions to ./transcribed_text.txt, one per line.

        Bug fix: the file handle was opened without a context manager, so it
        leaked when a write raised; `with` guarantees it is closed.
        """
        text_in_list = transcribed_df["Transcribed text"].tolist()

        file_name = "./transcribed_text.txt"
        with open(file_name, "w") as text_file:
            for text in text_in_list:
                text_file.write(text + "\n")

        return file_name, gr.update(visible=True)
140
+
141
+
142
+ if __name__ == "__main__":
143
+ pass
src/htr_pipeline/inferencer.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Protocol, Tuple
2
+
3
+ import gradio as gr
4
+ import mmcv
5
+ import numpy as np
6
+
7
+ from src.htr_pipeline.models import HtrModels
8
+ from src.htr_pipeline.utils.filter_segmask import FilterSegMask
9
+ from src.htr_pipeline.utils.helper import timer_func
10
+ from src.htr_pipeline.utils.order_of_object import OrderObject
11
+ from src.htr_pipeline.utils.preprocess_img import Preprocess
12
+ from src.htr_pipeline.utils.process_segmask import SegMaskHelper
13
+
14
+
15
class Inferencer:
    """Wraps the three HTR models (text-region detector, text-line detector,
    SATRN recognizer) together with the mask post-processing utilities."""

    def __init__(self, local_run=False):
        htr_models = HtrModels(local_run)
        self.seg_model = htr_models.load_region_model()
        self.line_model = htr_models.load_line_model()
        self.htr_model_inferencer = htr_models.load_htr_model()

        self.process_seg_mask = SegMaskHelper()
        self.postprocess_seg_mask = FilterSegMask()
        self.ordering = OrderObject()
        self.preprocess_img = Preprocess()

    @timer_func
    def predict_regions(self, input_image, pred_score_threshold=0.5, containments_threshold=0.5, visualize=True):
        """Segment text regions in `input_image`.

        Returns (visualization-or-None, ordered region crops, ordered
        polygons, ordered masks). Raises gr.Error when no region clears
        `pred_score_threshold`.
        """
        input_image = self.preprocess_img.binarize_img(input_image)

        image = mmcv.imread(input_image)
        result = self.seg_model(image, return_datasample=True)
        result_pred = result["predictions"][0]

        filtered_result_pred = self.postprocess_seg_mask.filter_on_pred_threshold(
            result_pred, pred_score_threshold=pred_score_threshold
        )

        if len(filtered_result_pred.pred_instances.masks) == 0:
            raise gr.Error("No Regions were predicted by the model")

        else:
            result_align = self.process_seg_mask.align_masks_with_image(filtered_result_pred, image)
            result_clean = self.postprocess_seg_mask.remove_overlapping_masks(
                predicted_mask=result_align, containments_threshold=containments_threshold
            )

            if visualize:
                result_viz = self.seg_model.visualize(
                    inputs=[image], preds=[result_clean], return_vis=True, no_save_vis=True
                )[0]
            else:
                result_viz = None

            # Order regions (marginalia-aware), then apply the same ordering
            # to crops, polygons and masks so the three lists stay aligned.
            regions_cropped, polygons = self.process_seg_mask.crop_masks(result_clean, image)
            order = self.ordering.order_regions_marginalia(result_clean)

            regions_cropped_ordered = [regions_cropped[i] for i in order]
            polygons_ordered = [polygons[i] for i in order]
            masks_ordered = [result_clean.pred_instances.masks[i] for i in order]

            return result_viz, regions_cropped_ordered, polygons_ordered, masks_ordered

    @timer_func
    def predict_lines(
        self,
        image,
        pred_score_threshold=0.5,
        containments_threshold=0.5,
        line_spacing_factor=0.5,
        visualize=True,
        custom_track=True,
    ):
        """Segment text lines in `image` (usually a region crop).

        Returns (visualization-or-None, ordered line crops, ordered polygons).
        With custom_track=True an empty prediction raises gr.Error; otherwise
        (None, None, None) is returned so the fast track can skip the region.
        """
        result_tl = self.line_model(image, return_datasample=True)
        result_tl_pred = result_tl["predictions"][0]

        filtered_result_tl_pred = self.postprocess_seg_mask.filter_on_pred_threshold(
            result_tl_pred, pred_score_threshold=pred_score_threshold
        )

        if len(filtered_result_tl_pred.pred_instances.masks) == 0 and custom_track:
            raise gr.Error("No Lines were predicted by the model")

        elif len(filtered_result_tl_pred.pred_instances.masks) == 0 and not custom_track:
            return None, None, None

        else:
            result_tl_align = self.process_seg_mask.align_masks_with_image(filtered_result_tl_pred, image)
            result_tl_clean = self.postprocess_seg_mask.remove_overlapping_masks(
                predicted_mask=result_tl_align, containments_threshold=containments_threshold
            )

            if visualize:
                # Bug fix: previously visualized through the *region* model
                # (self.seg_model); line predictions belong to the line model.
                result_viz = self.line_model.visualize(
                    inputs=[image], preds=[result_tl_clean], return_vis=True, no_save_vis=True
                )[0]
            else:
                result_viz = None

            lines_cropped, lines_polygons = self.process_seg_mask.crop_masks(result_tl_clean, image)
            ordered_indices = self.ordering.order_lines(
                line_image=result_tl_clean, line_spacing_factor=line_spacing_factor
            )

            lines_cropped_ordered = [lines_cropped[i] for i in ordered_indices]
            lines_polygons_ordered = [lines_polygons[i] for i in ordered_indices]

            return result_viz, lines_cropped_ordered, lines_polygons_ordered

    @timer_func
    def transcribe(self, line_cropped):
        """Run the SATRN recognizer on one line crop.

        Returns (predicted text, prediction score) from the first prediction.
        """
        result_rec = self.htr_model_inferencer(line_cropped)
        return result_rec["predictions"][0]["text"], result_rec["predictions"][0]["scores"]

    # def transcribe_different_model(self, image):
    #     processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    #     model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

    #     # prepare image
    #     pixel_values = processor(image, return_tensors="pt").pixel_values

    #     # generate (no beam search)
    #     generated_ids = model.generate(pixel_values)

    #     # decode
    #     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    #     return generated_text
127
+
128
+ # return generated_text
129
+
130
+
131
class InferencerInterface(Protocol):
    """Structural (duck-typed) interface of Inferencer, consumed by the
    gradio front-end classes so they do not depend on the concrete class."""

    def predict_regions(
        self,
        image: np.ndarray,
        pred_score_threshold: float,
        containments_threshold: float,
        visualize: bool = False,
    ) -> Tuple:
        """Segment text regions; returns (viz, crops, polygons, masks)."""
        ...

    def predict_lines(
        self,
        text_region: np.ndarray,
        pred_score_threshold: float,
        containments_threshold: float,
        visualize: bool = False,
        custom_track: bool = False,
    ) -> Tuple:
        """Segment text lines within a region; returns (viz, crops, polygons)."""
        ...

    def transcribe(
        self,
        line: np.ndarray,
    ) -> Tuple[str, float]:
        """Recognize one line crop; returns (text, prediction score)."""
        ...
156
+
157
+
158
+ if __name__ == "__main__":
159
+ prediction_model = Inferencer()
src/htr_pipeline/models.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from huggingface_hub import snapshot_download
5
+ from mmdet.apis import DetInferencer
6
+
7
+ # from mmengine import Config
8
+ from mmocr.apis import TextRecInferencer
9
+
10
+
11
class HtrModels:
    """Resolves model configs/checkpoints and builds the three inferencers.

    Checkpoints come from local ./models folders by default, or from the
    Hugging Face hub when running inside the Docker container.
    """

    def __init__(self, local_run=False):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Set by the Dockerfile; truthy only when running inside the container.
        # (Renamed from the misleading `SECRET_KEY` — it is just a flag.)
        in_docker = os.environ.get("AM_I_IN_A_DOCKER_CONTAINER", False)

        model_folder = "./models"
        self.region_config = f"{model_folder}/RmtDet_regions/rtmdet_m_textregions_2_concat.py"
        self.line_config = f"{model_folder}/RmtDet_lines/rtmdet_m_textlines_2_concat.py"
        self.mmocr_config = f"{model_folder}/SATRN/_base_satrn_shallow_concat.py"

        # Bug fix: self.line_checkpoint was assigned here and then
        # unconditionally reassigned in both branches below — dead code removed.
        if in_docker:
            config_path = self.get_config()
            self.region_checkpoint = config_path["region_checkpoint"]
            self.line_checkpoint = config_path["line_checkpoint"]
            self.mmocr_checkpoint = config_path["mmocr_checkpoint"]
        else:
            self.region_checkpoint = f"{model_folder}/RmtDet_regions/epoch_12.pth"
            self.line_checkpoint = f"{model_folder}/RmtDet_lines/epoch_12.pth"
            self.mmocr_checkpoint = f"{model_folder}/SATRN/epoch_5.pth"

    def load_region_model(self):
        """Build the text-region detector from its config and checkpoint."""
        return DetInferencer(self.region_config, self.region_checkpoint, device=self.device)

    def load_line_model(self):
        """Build the text-line detector from its config and checkpoint."""
        return DetInferencer(self.line_config, self.line_checkpoint, device=self.device)

    def load_htr_model(self):
        """Build the SATRN text recognizer from its config and checkpoint."""
        return TextRecInferencer(self.mmocr_config, self.mmocr_checkpoint, device=self.device)

    @staticmethod
    def get_config():
        """Download the checkpoints from the Hugging Face hub and return
        their local paths.

        Security fix: the access token was a hard-coded "__INSERT__..."
        placeholder committed to the repo; read it from the environment
        instead (None means anonymous access for public repos).
        """
        path_models = snapshot_download(
            "Riksarkivet/HTR_pipeline_models",
            allow_patterns=["*.pth"],
            token=os.environ.get("HUGGINGFACE_TOKEN"),
            cache_dir="./",
        )
        config_path = {
            "region_checkpoint": os.path.join(path_models, "RmtDet_regions/epoch_12.pth"),
            "line_checkpoint": os.path.join(path_models, "RmtDet_lines/epoch_12.pth"),
            "mmocr_checkpoint": os.path.join(path_models, "SATRN/epoch_5.pth"),
        }

        return config_path
src/htr_pipeline/pipeline.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Protocol
2
+
3
+ import mmcv
4
+ import numpy as np
5
+
6
+ from htr_pipeline.utils.parser_xml import XmlParser
7
+ from src.htr_pipeline.inferencer import Inferencer
8
+ from src.htr_pipeline.utils.helper import timer_func
9
+ from src.htr_pipeline.utils.preprocess_img import Preprocess
10
+ from src.htr_pipeline.utils.process_xml import XMLHelper
11
+
12
+
13
class Pipeline:
    """High-level HTR pipeline: image -> Page XML -> visualization / txt."""

    def __init__(self, inferencer: Inferencer) -> None:
        self.inferencer = inferencer
        self.xml = XMLHelper()
        self.preprocess_img = Preprocess()
        # Created lazily; see visualize_xml() / parse_xml_to_txt().
        self.xml_visualizer_and_parser = None

    @timer_func
    def running_htr_pipeline(
        self,
        input_image: np.ndarray,
        pred_score_threshold_regions: float = 0.4,
        pred_score_threshold_lines: float = 0.4,
        containments_threshold: float = 0.5,
    ) -> str:
        """Run segmentation + recognition on `input_image` and return the
        rendered Page XML string."""
        input_image = self.preprocess_img.binarize_img(input_image)
        image = mmcv.imread(input_image)

        rendered_xml = self.xml.image_to_page_xml(
            image, pred_score_threshold_regions, pred_score_threshold_lines, containments_threshold, self.inferencer
        )

        return rendered_xml

    @timer_func
    def visualize_xml(self, input_image: np.ndarray) -> np.ndarray:
        """Overlay the generated Page XML on a binarized copy of the image."""
        self.xml_visualizer_and_parser = XmlParser()
        bin_input_image = self.preprocess_img.binarize_img(input_image)
        xml_image = self.xml_visualizer_and_parser.visualize_xml(bin_input_image)
        return xml_image

    @timer_func
    def parse_xml_to_txt(self) -> None:
        """Export the Page XML to a plain-text file.

        Bug fix: previously raised AttributeError when called before
        visualize_xml() (the parser was only created there); the parser is now
        created on demand.
        """
        if self.xml_visualizer_and_parser is None:
            self.xml_visualizer_and_parser = XmlParser()
        self.xml_visualizer_and_parser.xml_to_txt()
46
+
47
+
48
class PipelineInterface(Protocol):
    """Structural (duck-typed) interface of Pipeline, consumed by the gradio
    front-end so it does not depend on the concrete class."""

    def __init__(self, inferencer: Inferencer) -> None:
        ...

    def running_htr_pipeline(
        self,
        input_image: np.ndarray,
        pred_score_threshold_regions: float = 0.4,
        pred_score_threshold_lines: float = 0.4,
        containments_threshold: float = 0.5,
    ) -> str:
        """Run the full pipeline and return rendered Page XML."""
        ...

    def visualize_xml(self, input_image: np.ndarray) -> np.ndarray:
        """Overlay the generated Page XML on the (binarized) input image."""
        ...

    def parse_xml_to_txt(self) -> None:
        """Export the Page XML to a plain-text file."""
        ...
66
+
67
+
68
+ if __name__ == "__main__":
69
+ prediction_model = Inferencer()
70
+ pipeline = Pipeline(prediction_model)
src/htr_pipeline/utils/__init__.py ADDED
File without changes
src/htr_pipeline/utils/filter_segmask.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ from mmdet.structures import DetDataSample
5
+ from mmengine.structures import InstanceData
6
+
7
+
8
class FilterSegMask:
    """Post-processing for instance-segmentation output: score filtering and
    removal of masks that are (mostly) contained inside larger masks."""

    def __init__(self):
        # Mask-containment math runs on GPU tensors when CUDA is available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Removes smaller masks that are contained in a bigger mask
    # @timer_func
    def remove_overlapping_masks(self, predicted_mask, method="mask", containments_threshold=0.5):
        """Drop instances whose pairwise containment exceeds the threshold.

        method: "mask" compares pixel overlap, "bbox" compares bounding boxes.
        Of each overlapping pair the larger mask survives. Returns a new
        DetDataSample with only the surviving instances.
        """
        # Convert masks to binary images
        masks = [mask.cpu().numpy() for mask in predicted_mask.pred_instances.masks]
        masks_binary = [(mask > 0).astype(np.uint8) for mask in masks]

        masks_tensor = predicted_mask.pred_instances.masks
        masks_tensor = [mask.to(self.device) for mask in masks_tensor]

        # Compute bounding boxes (x, y, w, h) — used by the "bbox" method.
        boxes = [cv2.boundingRect(mask) for mask in masks_binary]

        # Compute pairwise containment
        containments = np.zeros((len(masks), len(masks)))

        for i in range(len(masks)):
            box_i = boxes[i]

            for j in range(i + 1, len(masks)):
                box_j = boxes[j]

                if method == "mask":
                    containment = self._calculate_containment_mask(masks_tensor[i], masks_tensor[j])
                    containments[i, j] = containment
                    containment = self._calculate_containment_mask(masks_tensor[j], masks_tensor[i])
                    containments[j, i] = containment
                elif method == "bbox":
                    containment = self._calculate_containment_bbox(box_i, box_j)
                    containments[i, j] = containment
                    containment = self._calculate_containment_bbox(box_j, box_i)
                    containments[j, i] = containment

        # Keep only the biggest masks for overlapping pairs
        keep_mask = np.ones(len(masks), dtype=np.bool_)
        for i in range(len(masks)):
            if not keep_mask[i]:
                continue
            if np.any(containments[i] > containments_threshold):
                contained_indices = np.where(containments[i] > containments_threshold)[0]
                for j in contained_indices:
                    if np.count_nonzero(masks_binary[i]) >= np.count_nonzero(masks_binary[j]):
                        keep_mask[j] = False
                    else:
                        keep_mask[i] = False

        # Create a new DetDataSample with only selected instances
        filtered_result = DetDataSample(metainfo=predicted_mask.metainfo)
        pred_instances = InstanceData(metainfo=predicted_mask.metainfo)

        masks = [mask for i, mask in enumerate(masks) if keep_mask[i]]
        list_of_tensor_masks = [torch.from_numpy(mask) for mask in masks]
        stacked_masks = torch.stack(list_of_tensor_masks)

        updated_filtered_result = self._stacked_masks_update_data_sample(
            filtered_result, stacked_masks, pred_instances, keep_mask, predicted_mask
        )

        return updated_filtered_result

    def _stacked_masks_update_data_sample(self, filtered_result, stacked_masks, pred_instances, keep_mask, result):
        """Copy all instance fields, filtered by keep_mask, into the new sample."""
        pred_instances.masks = stacked_masks
        pred_instances.bboxes = self._update_datasample_cat(result.pred_instances.bboxes.tolist(), keep_mask)
        pred_instances.scores = self._update_datasample_cat(result.pred_instances.scores.tolist(), keep_mask)
        pred_instances.kernels = self._update_datasample_cat(result.pred_instances.kernels.tolist(), keep_mask)
        pred_instances.labels = self._update_datasample_cat(result.pred_instances.labels.tolist(), keep_mask)
        pred_instances.priors = self._update_datasample_cat(result.pred_instances.priors.tolist(), keep_mask)

        filtered_result.pred_instances = pred_instances

        return filtered_result

    def _calculate_containment_bbox(self, box_a, box_b):
        """Fraction of box_a covered by box_b (boxes are (x, y, w, h)).

        NOTE(review): the +1 treats coordinates as inclusive pixel indices, so
        identical boxes yield a value slightly above 1 — confirm intended.
        """
        xA = max(box_a[0], box_b[0])  # max x0
        yA = max(box_a[1], box_b[1])  # max y0
        xB = min(box_a[0] + box_a[2], box_b[0] + box_b[2])  # min x1
        yB = min(box_a[1] + box_a[3], box_b[1] + box_b[3])  # min y1

        box_a_area = box_a[2] * box_a[3]
        box_b_area = box_b[2] * box_b[3]

        intersection_area = max(0, xB - xA + 1) * max(0, yB - yA + 1)
        containment = intersection_area / box_a_area if box_a_area > 0 else 0
        return containment

    def _calculate_containment_mask(self, mask_a, mask_b):
        """Fraction of mask_b's pixels that also lie inside mask_a."""
        intersection = torch.logical_and(mask_a, mask_b).sum().float()
        containment = intersection / mask_b.sum().float() if mask_b.sum() > 0 else 0
        return containment

    def _update_datasample_cat(self, cat_list, keep_mask):
        """Filter a per-instance list by keep_mask and return it as a tensor."""
        cat_keep = [cat for i, cat in enumerate(cat_list) if keep_mask[i]]
        tensor_cat_keep = torch.tensor(cat_keep)
        return tensor_cat_keep

    # @timer_func
    def filter_on_pred_threshold(self, result_pred, pred_score_threshold=0.5):
        """Return a new DetDataSample keeping only instances whose score
        exceeds `pred_score_threshold`.

        Bug fix: the function previously ended with a duplicated, unreachable
        `return` statement — removed.
        """
        id_list = []
        for id, pred_score in enumerate(result_pred.pred_instances.scores):
            if pred_score > pred_score_threshold:
                id_list.append(id)

        # Create a new DetDataSample with only selected instances
        new_filtered_result = DetDataSample(metainfo=result_pred.metainfo)
        new_pred_instances = InstanceData(metainfo=result_pred.metainfo)

        new_pred_instances.masks = result_pred.pred_instances.masks[id_list]
        new_pred_instances.bboxes = result_pred.pred_instances.bboxes[id_list]
        new_pred_instances.scores = result_pred.pred_instances.scores[id_list]
        new_pred_instances.kernels = result_pred.pred_instances.kernels[id_list]
        new_pred_instances.labels = result_pred.pred_instances.labels[id_list]
        new_pred_instances.priors = result_pred.pred_instances.priors[id_list]

        new_filtered_result.pred_instances = new_pred_instances
        return new_filtered_result
src/htr_pipeline/utils/helper.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import threading
3
+ import time
4
+
5
+ import tqdm
6
+
7
+
8
def timer_func(func):
    """Decorator that prints the wall-clock execution time of each call.

    Fix: applies functools.wraps so the wrapped function keeps its name,
    docstring and signature metadata (the original wrapper hid them, which
    breaks introspection and the f-string below for nested decorators).
    """

    @functools.wraps(func)
    def wrap_func(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print(f"Function {func.__name__!r} executed in {(t2-t1):.4f}s")
        return result

    return wrap_func
19
+
20
+
21
def long_running_function(*args, **kwargs):
    """Demo workload: block for five seconds, then report success.

    Positional and keyword arguments are accepted and ignored so this can
    stand in for an arbitrary long-running call in the examples below.
    """
    time.sleep(5)
    return "success"
25
+
26
+
27
def provide_progress_bar(function, estimated_time, tstep=0.2, tqdm_kwargs=None, args=None, kwargs=None):
    """Tqdm wrapper for a long-running function.

    Args:
        function: callable to run.
        estimated_time: how long (seconds) you expect the function to take;
            used as the progress bar's total.
        tstep: time delta (seconds) between progress-bar updates.
        tqdm_kwargs: kwargs used to construct the progress bar.
        args: positional args to pass to the function.
        kwargs: keyword args to pass to the function.

    Returns:
        function(*args, **kwargs)

    Fixes: the original used mutable default arguments ({} / []), which are
    shared between calls - replaced with None sentinels. The progress bar is
    now closed in a finally block so it is not leaked on error.
    """
    tqdm_kwargs = {} if tqdm_kwargs is None else tqdm_kwargs
    args = [] if args is None else args
    kwargs = {} if kwargs is None else kwargs

    ret = [None]  # Mutable cell so the worker thread can hand back its result

    def myrunner(function, ret, *args, **kwargs):
        ret[0] = function(*args, **kwargs)

    thread = threading.Thread(target=myrunner, args=(function, ret) + tuple(args), kwargs=kwargs)
    pbar = tqdm.tqdm(total=estimated_time, **tqdm_kwargs)

    thread.start()
    try:
        # Poll the worker thread, nudging the bar forward each tstep.
        while thread.is_alive():
            thread.join(timeout=tstep)
            pbar.update(tstep)
    finally:
        pbar.close()
    return ret[0]
54
+
55
+
56
def progress_wrapped(estimated_time, tstep=0.2, tqdm_kwargs=None):
    """Decorate a function to add a progress bar.

    Fixes: the mutable default tqdm_kwargs={} was shared across every
    decorated function - replaced with a None sentinel that is normalized
    to a fresh dict at call time.
    """

    def real_decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            return provide_progress_bar(
                function,
                estimated_time=estimated_time,
                tstep=tstep,
                tqdm_kwargs={} if tqdm_kwargs is None else tqdm_kwargs,
                args=args,
                kwargs=kwargs,
            )

        return wrapper

    return real_decorator
69
+
70
+
71
@progress_wrapped(estimated_time=5)
def another_long_running_function(*args, **kwargs):
    """Demo task: sleep five seconds under a tqdm progress bar.

    Arguments are accepted and ignored; the decorator supplies the bar.
    """
    time.sleep(5)
    return "success"
76
+
77
+
78
if __name__ == "__main__":
    # Basic example: default progress bar around a plain function.
    retval = provide_progress_bar(long_running_function, estimated_time=5)
    print(retval)

    # Full example: custom bar format, update interval and pass-through args.
    retval = provide_progress_bar(
        long_running_function,
        estimated_time=5,
        tstep=1 / 5.0,
        tqdm_kwargs={"bar_format": "{desc}: {percentage:3.0f}%|{bar}| {n:.1f}/{total:.1f} [{elapsed}<{remaining}]"},
        args=(1, "foo"),
        kwargs={"spam": "eggs"},
    )
    print(retval)

    # Example of using the decorator (called twice to show the bar resets).
    # Bug fix: a third, duplicated `print(retval)` left over from a bad
    # merge was removed.
    retval = another_long_running_function()
    print(retval)
    retval = another_long_running_function()
    print(retval)
src/htr_pipeline/utils/order_of_object.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+
5
class OrderObject:
    """Computes a reading order for segmented text lines and text regions."""

    def __init__(self):
        pass

    def order_lines(self, line_image, line_spacing_factor=0.5):
        """Return the indices of line_image's predicted boxes in reading order.

        Boxes are unpacked as [x_min, y_min, x_max, y_max]. Lines are first
        bucketed into rows by their vertical centers (centers within the
        threshold distance land in the same bucket), then ordered left to
        right inside each row.
        """
        bounding_boxes = line_image.pred_instances.bboxes.tolist()
        center_points = [(box[1] + box[3]) / 2 for box in bounding_boxes]
        horizontal_positions = [(box[0] + box[2]) / 2 for box in bounding_boxes]

        # Row-height tolerance derived from the average line height.
        threshold_distance = self._calculate_threshold_distance(bounding_boxes, line_spacing_factor)

        # Sort by row bucket first, then horizontally within the row.
        indices = list(range(len(bounding_boxes)))
        indices.sort(
            key=lambda i: (
                center_points[i] // threshold_distance,
                horizontal_positions[i],
            )
        )

        # Ordered text-line indices
        return indices

    def _calculate_threshold_distance(self, bounding_boxes, line_spacing_factor=0.5):
        """Average line height scaled by line_spacing_factor."""
        total_height = sum(box[3] - box[1] for box in bounding_boxes)
        average_height = total_height / len(bounding_boxes)

        # The factor controls how far apart two vertical centers may be
        # while still being considered part of the same text row.
        threshold_distance = average_height * line_spacing_factor

        return threshold_distance

    def order_regions_marginalia(self, region_image, margin_ratio=0.2, histogram_bins=50, histogram_dip_ratio=0.5):
        """Return region ids ordered for reading, pushing marginalia last.

        A histogram of horizontal centroids detects a two-page spread (a
        pronounced dip in the middle). Regions are then ordered by page,
        main body before margin, then top-to-bottom and left-to-right.

        NOTE(review): unlike order_lines, boxes here are unpacked as
        [x, y, width, height] - confirm the bbox format produced upstream.
        """
        bounding_boxes = region_image.pred_instances.bboxes.tolist()
        img_width = region_image.metainfo["ori_shape"][1]

        regions = [[i, x[0], x[1], x[0] + x[2], x[1] + x[3]] for i, x in enumerate(bounding_boxes)]

        # Create a pandas DataFrame from the regions
        df = pd.DataFrame(regions, columns=["region_id", "x_min", "y_min", "x_max", "y_max"])

        # Centroids of the bounding boxes.
        df["centroid_x"] = (df["x_min"] + df["x_max"]) / 2
        df["centroid_y"] = (df["y_min"] + df["y_max"]) / 2

        # Histogram of x-centroids; a strong dip suggests a two-page layout.
        histogram, bin_edges = np.histogram(df["centroid_x"], bins=histogram_bins)
        is_two_pages = np.min(histogram) < np.max(histogram) * histogram_dip_ratio

        if is_two_pages:
            # Assign each region to a page, then flag the outer margin
            # column of each page.
            page_width = int(img_width / 2)
            df["page"] = (df["centroid_x"] > page_width).astype(int)

            margin_width = page_width * margin_ratio
            df["is_margin"] = ((df["page"] == 0) & (df["centroid_x"] < margin_width)) | (
                (df["page"] == 1) & (df["centroid_x"] > img_width - margin_width)
            )
        else:
            # Single page: margins are the outer margin_ratio strips of the
            # whole image. (Bug fix: the original referenced page_width here,
            # which is only defined in the two-page branch -> NameError.)
            df["page"] = 0
            df["is_margin"] = (df["centroid_x"] < img_width * margin_ratio) | (
                df["centroid_x"] > img_width - img_width * margin_ratio
            )

        # Sort key: page, main body before margin, top-down, left-right.
        sort_regions = lambda row: (
            row["page"],
            row["is_margin"],
            row["centroid_y"],
            row["centroid_x"],
        )

        df["sort_key"] = df.apply(sort_regions, axis=1)
        df = df.sort_values("sort_key")

        # Return the ordered region ids
        return df["region_id"].tolist()
src/htr_pipeline/utils/parser_xml.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import random
4
+ import xml.etree.ElementTree as ET
5
+
6
+ from PIL import Image, ImageDraw, ImageFont
7
+
8
+
9
class XmlParser:
    """Reads a PAGE-XML (2013 schema) file and renders/exports its contents."""

    def __init__(self, page_xml="./page_xml.xml"):
        """Parse the given PAGE-XML file and cache its root element."""
        self.tree = ET.parse(page_xml, parser=ET.XMLParser(encoding="utf-8"))
        self.root = self.tree.getroot()
        # Every element lookup below must be qualified with this namespace.
        self.namespace = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"

    def visualize_xml(
        self,
        background_image,
        font_size=9,
        text_offset=10,
        font_path_tff="./src/htr_pipeline/utils/templates/arial.ttf",
    ):
        """Overlay each text line's polygon and transcription on the image.

        Args:
            background_image: array convertible via Image.fromarray, used as
                the backdrop (presumably RGB - TODO confirm against caller).
            font_size: base font size before scaling by region width.
            text_offset: vertical distance (pixels) the text is drawn above
                each line polygon.
            font_path_tff: path to the TTF font used for rendering.

        Returns:
            A PIL RGBA image with semi-transparent line polygons and text.

        NOTE(review): ImageDraw.textsize was removed in Pillow 10 - confirm
        the pinned Pillow version or migrate to textbbox/textlength.
        """
        image = Image.fromarray(background_image).convert("RGBA")
        # Page dimensions from the XML; currently unused by the drawing code.
        image_width = int(self.root.find(f"{self.namespace}Page").attrib["imageWidth"])
        image_height = int(self.root.find(f"{self.namespace}Page").attrib["imageHeight"])

        # Negative offset moves the rendered text above the line polygon.
        text_offset = -text_offset
        base_font_size = font_size
        font_path = font_path_tff

        max_bbox_width = 0  # Initialize maximum bounding box width

        # First pass: find the widest text region to derive a font scale.
        for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
            coords = textregion.find(f"{self.namespace}Coords").attrib["points"].split()
            points = [tuple(map(int, point.split(","))) for point in coords]
            x_coords, y_coords = zip(*points)
            min_x, max_x = min(x_coords), max(x_coords)
            bbox_width = max_x - min_x  # Width of the current bounding box
            max_bbox_width = max(max_bbox_width, bbox_width)  # Update maximum bounding box width

        # Scale the font relative to a 400px reference region width.
        scaling_factor = max_bbox_width / 400.0  # Use maximum bounding box width for scaling
        font_size_scaled = int(base_font_size * scaling_factor)
        font = ImageFont.truetype(font_path, font_size_scaled)

        # Second pass: draw each line polygon (one random translucent color
        # per region) and its transcription.
        for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
            fill_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 100)
            for textline in textregion.findall(f".//{self.namespace}TextLine"):
                coords = textline.find(f"{self.namespace}Coords").attrib["points"].split()
                points = [tuple(map(int, point.split(","))) for point in coords]

                # Each polygon goes on its own layer so the alpha blends
                # correctly when composited onto the page image.
                poly_image = Image.new("RGBA", image.size)
                poly_draw = ImageDraw.Draw(poly_image)
                poly_draw.polygon(points, fill=fill_color)

                text = textline.find(f"{self.namespace}TextEquiv").find(f"{self.namespace}Unicode").text

                x_coords, y_coords = zip(*points)
                min_x, max_x = min(x_coords), max(x_coords)
                min_y = min(y_coords)
                text_width, text_height = poly_draw.textsize(text, font=font)  # Get text size
                text_position = (
                    (min_x + max_x) // 2 - text_width // 2,
                    min_y + text_offset,
                )  # Center text horizontally

                poly_draw.text(text_position, text, fill=(0, 0, 0), font=font)
                image = Image.alpha_composite(image, poly_image)

        return image

    def xml_to_txt(self, output_file="page_txt.txt"):
        """Dump all transcriptions to a plain-text file.

        One line per TextLine, with a blank line separating TextRegions.
        NOTE(review): assumes every TextLine carries a non-None Unicode
        text node; a missing one would make `text` None and raise here.
        """
        with open(output_file, "w", encoding="utf-8") as f:
            for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
                for textline in textregion.findall(f".//{self.namespace}TextLine"):
                    text = textline.find(f"{self.namespace}TextEquiv").find(f"{self.namespace}Unicode").text
                    f.write(text + "\n")
                f.write("\n")
src/htr_pipeline/utils/preprocess_img.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+
4
+
5
class Preprocess:
    """Image preprocessing helpers for the HTR pipeline."""

    def __init__(self):
        pass

    def binarize_img(self, img):
        """Denoise and adaptively binarize an RGB image.

        Args:
            img: numpy array in RGB channel order (e.g. from Gradio).

        Returns:
            A 3-channel RGB uint8 image containing the binarized page.
        """
        img_ori = cv2.cvtColor(img.astype("uint8"), cv2.COLOR_RGB2BGR)
        img_gray = cv2.cvtColor(img_ori, cv2.COLOR_BGR2GRAY)
        # Non-local means removes scan noise before thresholding.
        dst = cv2.fastNlMeansDenoising(img_gray, h=31, templateWindowSize=7, searchWindowSize=21)
        img_blur = cv2.medianBlur(dst, 3).astype("uint8")
        threshed = cv2.adaptiveThreshold(img_blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
        # Bug fix: `threshed` is single-channel, but COLOR_BGR2RGB requires a
        # 3/4-channel input and raises in OpenCV; COLOR_GRAY2RGB is the
        # correct conversion back to a displayable RGB image.
        img_gradio = cv2.cvtColor(threshed, cv2.COLOR_GRAY2RGB)

        return img_gradio
src/htr_pipeline/utils/process_segmask.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ from mmdet.registry import VISUALIZERS
5
+
6
+
7
class SegMaskHelper:
    """Utilities for post-processing instance-segmentation masks."""

    def __init__(self):
        pass

    # Pad the masks to image size (bug in RTMDet config?)
    # @timer_func
    def align_masks_with_image(self, result, img):
        """Resize and pad every predicted mask to the image's spatial size.

        Mutates result.pred_instances.masks in place (replacing it with a
        stacked torch tensor) and returns the same result object.
        """
        masks = list()

        # Channel flip (RGB<->BGR); only img's shape is read below, so the
        # flip itself does not affect the masks.
        img = img[..., ::-1].copy()

        for j, mask in enumerate(result.pred_instances.masks):
            numpy_mask = mask.cpu().numpy()
            # Nearest-neighbour keeps the mask binary while resizing.
            mask = cv2.resize(
                numpy_mask.astype(np.uint8),
                (img.shape[1], img.shape[0]),
                interpolation=cv2.INTER_NEAREST,
            )

            # Pad the mask to match the size of the image
            padded_mask = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8)
            padded_mask[: mask.shape[0], : mask.shape[1]] = mask
            mask = padded_mask
            mask = torch.from_numpy(mask)
            masks.append(mask)

        # NOTE(review): torch.stack raises on an empty list - assumes at
        # least one predicted instance.
        stacked_masks = torch.stack(masks)
        result.pred_instances.masks = stacked_masks

        return result

    # Crops the images using masks and put the cropped images on a white background
    # @timer_func
    def crop_masks(self, result, img):
        """Extract each masked region of img onto a white background.

        Returns:
            (cropped_imgs, polygons): per-instance image crops and the
            simplified largest-contour polygon of each mask, in image
            (crop-local origin at the mask's bounding box) coordinates.
        """
        cropped_imgs = list()
        polygons = list()

        for j, mask in enumerate(result.pred_instances.masks):
            np_array = mask.cpu().numpy()
            contours, _ = cv2.findContours(
                np_array.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
            )  # fix so only one contour (the largest one) is extracted
            largest_contour = max(contours, key=cv2.contourArea)

            # Simplify the contour with 0.3% of its perimeter as tolerance.
            epsilon = 0.003 * cv2.arcLength(largest_contour, True)
            approx_poly = cv2.approxPolyDP(largest_contour, epsilon, True)
            approx_poly = np.squeeze(approx_poly)
            approx_poly = approx_poly.tolist()
            polygons.append(approx_poly)

            x, y, w, h = cv2.boundingRect(largest_contour)

            # Crop masked region and put on white background
            masked_region = img[y : y + h, x : x + w]
            white_background = np.ones_like(masked_region)
            white_background.fill(255)
            masked_region_on_white = cv2.bitwise_and(
                white_background, masked_region, mask=np_array.astype(np.uint8)[y : y + h, x : x + w]
            )

            # Blank out the canvas inside the mask, then add the masked
            # pixels back so everything outside the mask stays white.
            cv2.bitwise_not(white_background, white_background, mask=np_array.astype(np.uint8)[y : y + h, x : x + w])
            res = white_background + masked_region_on_white

            cropped_imgs.append(res)

        return cropped_imgs, polygons

    def visualize_result(self, result, img, model_visualizer):
        """Render the detection result onto img with an mmdet visualizer."""
        visualizer = VISUALIZERS.build(model_visualizer)
        visualizer.add_datasample("result", img, data_sample=result, draw_gt=False)

        return visualizer.get_image()

    def _translate_line_coords(self, region_mask, line_polygons):
        """Shift line polygons from region-crop to page coordinates.

        The offset is the top-left corner of region_mask's bounding box.
        """
        region_mask = region_mask.cpu().numpy()
        region_masks_binary = (region_mask > 0).astype(np.uint8)

        box = cv2.boundingRect(region_masks_binary)
        translated_line_polygons = [[[a + box[0], b + box[1]] for [a, b] in poly] for poly in line_polygons]

        return translated_line_polygons
src/htr_pipeline/utils/process_xml.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from datetime import datetime
4
+
5
+ import jinja2
6
+ from tqdm import tqdm
7
+
8
+ from src.htr_pipeline.inferencer import InferencerInterface
9
+ from src.htr_pipeline.utils.process_segmask import SegMaskHelper
10
+
11
+
12
class XMLHelper:
    """Builds PAGE-XML output from region/line segmentation and HTR results."""

    def __init__(self):
        self.process_seg_mask = SegMaskHelper()

    def image_to_page_xml(
        self,
        image,
        pred_score_threshold_regions,
        pred_score_threshold_lines,
        containments_threshold,
        inferencer: InferencerInterface,
        xml_file_name="page_xml.xml",
    ):
        """Run the region -> line -> HTR pipeline and render PAGE-XML.

        Args:
            image: page image as a numpy array (H, W, C).
            pred_score_threshold_regions: min score for region predictions.
            pred_score_threshold_lines: min score for line predictions.
            containments_threshold: containment threshold used when
                filtering nested/overlapping predictions.
            inferencer: provides predict_regions/predict_lines/transcribe.
            xml_file_name: value written as the XML imageFilename attribute.

        Returns:
            The rendered PAGE-XML document as a string.
        """
        img_height = image.shape[0]
        img_width = image.shape[1]
        img_file_name = xml_file_name

        template_data = self.prepare_template_data(img_file_name, img_width, img_height)

        template_data["textRegions"] = self._process_regions(
            image,
            inferencer,
            pred_score_threshold_regions,
            pred_score_threshold_lines,
            containments_threshold,
        )

        rendered_xml = self._render_xml(template_data)

        return rendered_xml

    def _transform_coords(self, input_string):
        """Collapse '[x , y]' pairs emitted by the template into 'x,y'."""
        pattern = r"\[\s*([^\s,]+)\s*,\s*([^\s\]]+)\s*\]"
        replacement = r"\1,\2"
        return re.sub(pattern, replacement, input_string)

    def _render_xml(self, template_data):
        """Render the PAGE-XML jinja template with template_data."""
        template_loader = jinja2.FileSystemLoader(searchpath="./src/htr_pipeline/utils/templates")
        template_env = jinja2.Environment(loader=template_loader, trim_blocks=True)
        template = template_env.get_template("page_xml_2013.xml")
        rendered_xml = template.render(template_data)
        rendered_xml = self._transform_coords(rendered_xml)
        return rendered_xml

    def prepare_template_data(self, img_file_name, img_width, img_height):
        """Base dict of template variables (metadata plus empty region list)."""
        now = datetime.now()
        date_time = now.strftime("%Y-%m-%d, %H:%M:%S")
        return {
            "created": date_time,
            "imageFilename": img_file_name,
            "imageWidth": img_width,
            "imageHeight": img_height,
            "textRegions": list(),
        }

    def _process_regions(
        self,
        image,
        inferencer: InferencerInterface,
        pred_score_threshold_regions,
        pred_score_threshold_lines,
        containments_threshold,
        htr_threshold=0.7,
    ):
        """Segment regions, transcribe their lines, and keep good regions.

        A region is kept only if it produced at least one confidently
        transcribed line and its mean HTR score exceeds htr_threshold.
        """
        _, regions_cropped_ordered, reg_polygons_ordered, reg_masks_ordered = inferencer.predict_regions(
            image,
            pred_score_threshold=pred_score_threshold_regions,
            containments_threshold=containments_threshold,
            visualize=False,
        )

        region_data_list = []
        for i, (text_region, reg_pol, mask) in tqdm(
            enumerate(zip(regions_cropped_ordered, reg_polygons_ordered, reg_masks_ordered))
        ):
            region_id = "region_" + str(i)
            region_data = dict()
            region_data["id"] = region_id
            region_data["boundary"] = reg_pol

            text_lines, htr_scores = self._process_lines(
                text_region,
                inferencer,
                pred_score_threshold_lines,
                containments_threshold,
                mask,
                region_id,
            )

            # Skip regions where line segmentation failed or kept no lines.
            # (Bug fix: an empty htr_scores list previously slipped past the
            # `is None` check and caused a ZeroDivisionError in the mean.)
            if not text_lines:
                continue

            region_data["textLines"] = text_lines
            mean_htr_score = sum(htr_scores) / len(htr_scores)

            if mean_htr_score > htr_threshold:
                region_data_list.append(region_data)

        return region_data_list

    def _process_lines(
        self,
        text_region,
        inferencer: InferencerInterface,
        pred_score_threshold_lines,
        containments_threshold,
        mask,
        region_id,
        htr_threshold=0.7,
    ):
        """Segment and transcribe the lines of one region.

        Returns:
            (text_lines, htr_scores): text_lines holds only lines whose HTR
            score exceeds htr_threshold, htr_scores holds every line's
            score; (None, None) when line segmentation produced nothing.
        """
        _, lines_cropped_ordered, line_polygons_ordered = inferencer.predict_lines(
            text_region,
            pred_score_threshold=pred_score_threshold_lines,
            containments_threshold=containments_threshold,
            visualize=False,
            custom_track=False,
        )

        if lines_cropped_ordered is None:
            return None, None

        # Line polygons are in region-crop coordinates; move them back into
        # full-page coordinates using the region mask's bounding box.
        line_polygons_ordered_trans = self.process_seg_mask._translate_line_coords(mask, line_polygons_ordered)

        htr_scores = list()
        text_lines = list()

        for j, (line, line_pol) in enumerate(zip(lines_cropped_ordered, line_polygons_ordered_trans)):
            line_id = "line_" + region_id + "_" + str(j)
            line_data = dict()
            line_data["id"] = line_id
            line_data["boundary"] = line_pol

            line_data["unicode"], htr_score = inferencer.transcribe(line)
            htr_scores.append(htr_score)

            # Keep only confidently transcribed lines.
            if htr_score > htr_threshold:
                text_lines.append(line_data)

        return text_lines, htr_scores
src/htr_pipeline/utils/templates/arial.ttf ADDED
Binary file (367 kB). View file
 
src/htr_pipeline/utils/templates/page_xml_2013.xml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
3
+ <Metadata>
4
+ <Creator>Swedish National Archives</Creator>
5
+ <Created>{{ created }}</Created>
6
+ </Metadata>
7
+ <Page imageFilename="{{ imageFilename }}" imageWidth="{{ imageWidth }}" imageHeight="{{ imageHeight }}">
8
+ {% for textRegion in textRegions %}
9
+ <TextRegion id="{{ textRegion.id }}" custom="readingOrder {index:{{ loop.index0 }};}">
10
+ <Coords points="{% for point in textRegion.boundary %}{{ point|join(',') }}{% if not loop.last %} {% endif %}{% endfor %}"/>
11
+ {% for textLine in textRegion.textLines %}
12
+ <TextLine id="{{ textLine.id }}" custom="readingOrder {index:{{ loop.index0 }};}">
13
+ {% if textLine.boundary %}
14
+ <Coords points="{% for point in textLine.boundary %}{{ point|join(',') }}{% if not loop.last %} {% endif %}{% endfor %}"/>
15
+ {% endif %}
16
+ {% if textLine.baseline %}
17
+ <Baseline points="{% for point in textLine.baseline %}{{ point|join(',') }}{% if not loop.last %} {% endif %}{% endfor %}"/>
18
+ {% endif %}
19
+ {% if textLine.unicode %}
20
+ <TextEquiv>
21
+ <Unicode>{{ textLine.unicode }}</Unicode>
22
+ </TextEquiv>
23
+ {% endif %}
24
+ </TextLine>
25
+ {% endfor %}
26
+ </TextRegion>
27
+ {% endfor %}
28
+ </Page>
29
+ </PcGts>
30
+
src/tests/.gitkeep ADDED
File without changes
test_api.ipynb ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Loaded as API: http://127.0.0.1:7860/ ✔\n",
13
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
14
+ "<PcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd\">\n",
15
+ " <Metadata>\n",
16
+ " <Creator>Swedish National Archives</Creator>\n",
17
+ " <Created>2023-06-13, 15:45:54</Created>\n",
18
+ " </Metadata>\n",
19
+ " <Page imageFilename=\"test_api\" imageWidth=\"8992\" imageHeight=\"6144\">\n",
20
+ " <TextRegion id=\"region_0\" custom=\"readingOrder {index:0;}\">\n",
21
+ " <Coords points=\"2154,1057 2170,1112 2205,1147 2300,1181 2401,1195 2643,1162 3188,1151 3311,1159 3367,1128 3392,1077 3373,1041 3316,1017 2597,983 2323,958 2200,966 2175,997\"/>\n",
22
+ " <TextLine id=\"line_0_0\" custom=\"readingOrder {index:0;}\">\n",
23
+ " <Coords points=\"2196,1087 2206,1115 2259,1114 2313,1149 2534,1153 2714,1133 2748,1136 2810,1165 2963,1134 3309,1143 3340,1136 3351,1111 3343,1064 3230,1036 3113,1027 2901,1038 2814,997 2751,1014 2594,1027 2489,986 2430,993 2377,977 2322,980 2302,996 2242,1009 2203,1041\"/>\n",
24
+ " <TextEquiv>\n",
25
+ " <Unicode>Ugglebo socken.</Unicode>\n",
26
+ " </TextEquiv>\n",
27
+ " </TextLine>\n",
28
+ " </TextRegion>\n",
29
+ " <TextRegion id=\"region_1\" custom=\"readingOrder {index:1;}\">\n",
30
+ " <Coords points=\"4161,1345 1523,1306 1351,1372 1364,5271 4223,5273 4243,3085\"/>\n",
31
+ " <TextLine id=\"line_1_0\" custom=\"readingOrder {index:0;}\">\n",
32
+ " <Coords points=\"1376,1376 1388,1499 1497,1526 1707,1480 3303,1477 3502,1519 3598,1486 4122,1462 4167,1432 4151,1366 1503,1345 1403,1347\"/>\n",
33
+ " <TextEquiv>\n",
34
+ " <Unicode>Uggleto socken med Amots Kapelllag, grändas¬</Unicode>\n",
35
+ " </TextEquiv>\n",
36
+ " </TextLine>\n",
37
+ " <TextLine id=\"line_1_1\" custom=\"readingOrder {index:1;}\">\n",
38
+ " <Coords points=\"1365,1590 1382,1628 2618,1621 2812,1654 3009,1624 3660,1619 3949,1659 4179,1618 4199,1526 4147,1489 3903,1505 3805,1484 3505,1527 2808,1501 2397,1526 2143,1478 1402,1538\"/>\n",
39
+ " <TextEquiv>\n",
40
+ " <Unicode>i norr Till Hauebe och Skogs socknar i Helsingland,</Unicode>\n",
41
+ " </TextEquiv>\n",
42
+ " </TextLine>\n",
43
+ " <TextLine id=\"line_1_2\" custom=\"readingOrder {index:2;}\">\n",
44
+ " <Coords points=\"1387,1733 1415,1776 1499,1794 3108,1803 4146,1773 4162,1724 4147,1674 3993,1685 3851,1652 2383,1688 2045,1651 1462,1689\"/>\n",
45
+ " <TextEquiv>\n",
46
+ " <Unicode>i oster till Hamrange socken, i sydost till Hille och</Unicode>\n",
47
+ " </TextEquiv>\n",
48
+ " </TextLine>\n",
49
+ " <TextLine id=\"line_1_3\" custom=\"readingOrder {index:3;}\">\n",
50
+ " <Coords points=\"4146,1826 4066,1814 3113,1843 1494,1821 1380,1835 1362,1930 1398,1956 2247,1936 2447,2019 2597,1955 2904,1948 3152,1997 3394,1941 3995,1938 4136,1916\"/>\n",
51
+ " <TextEquiv>\n",
52
+ " <Unicode>Wahlbo, i söder till Ofvansjö, i sydvest och vester</Unicode>\n",
53
+ " </TextEquiv>\n",
54
+ " </TextLine>\n",
55
+ " <TextLine id=\"line_1_4\" custom=\"readingOrder {index:4;}\">\n",
56
+ " <Coords points=\"1353,2083 1394,2115 1609,2106 1850,2150 2398,2101 4085,2100 4150,2083 4148,1977 4061,1965 3032,2006 2747,1945 2447,2015 2303,1974 1991,1996 1698,1954 1394,1967 1363,1997\"/>\n",
57
+ " <TextEquiv>\n",
58
+ " <Unicode>till Svärdsjö socken i Dalarne, och i nordvest till</Unicode>\n",
59
+ " </TextEquiv>\n",
60
+ " </TextLine>\n",
61
+ " <TextLine id=\"line_1_5\" custom=\"readingOrder {index:5;}\">\n",
62
+ " <Coords points=\"1361,2222 1396,2269 2462,2259 2651,2281 3003,2261 3303,2291 3554,2257 4058,2255 4153,2235 4159,2181 3910,2120 3668,2145 2638,2147 2445,2119 2250,2150 1700,2156 1497,2118 1376,2147\"/>\n",
63
+ " <TextEquiv>\n",
64
+ " <Unicode>Bollnäs socken i Helsingland, intager en större å</Unicode>\n",
65
+ " </TextEquiv>\n",
66
+ " </TextLine>\n",
67
+ " <TextLine id=\"line_1_6\" custom=\"readingOrder {index:6;}\">\n",
68
+ " <Coords points=\"1366,2374 1396,2421 1775,2423 1947,2474 2067,2428 2183,2432 2205,2478 2264,2444 2469,2422 3013,2425 3146,2491 3503,2420 4147,2416 4183,2377 4149,2296 3239,2314 2853,2275 2649,2291 2506,2261 2179,2314 1604,2287 1400,2304\"/>\n",
69
+ " <TextEquiv>\n",
70
+ " <Unicode>real än någon af Gestriklands öfrige socknar el¬</Unicode>\n",
71
+ " </TextEquiv>\n",
72
+ " </TextLine>\n",
73
+ " <TextLine id=\"line_1_7\" custom=\"readingOrder {index:7;}\">\n",
74
+ " <Coords points=\"4149,2517 4142,2476 4003,2429 3799,2465 3243,2464 3056,2488 2703,2483 2295,2431 1955,2477 1408,2442 1370,2466 1365,2534 1399,2582 1810,2575 2003,2645 2245,2573 2466,2591 3453,2572 3653,2634 3816,2577 4115,2568\"/>\n",
75
+ " <TextEquiv>\n",
76
+ " <Unicode>ler nära 1/4 af hela provinsen och år ett af Större</Unicode>\n",
77
+ " </TextEquiv>\n",
78
+ " </TextLine>\n",
79
+ " <TextLine id=\"line_1_8\" custom=\"readingOrder {index:8;}\">\n",
80
+ " <Coords points=\"4154,2677 4117,2647 3846,2600 3647,2640 2691,2601 2008,2639 1403,2614 1370,2683 1401,2726 1776,2725 1952,2784 2204,2722 2903,2721 3104,2797 3349,2782 3592,2731 3999,2783 4145,2726\"/>\n",
81
+ " <TextEquiv>\n",
82
+ " <Unicode>vattendrag genomskuret, med berg, sjöar och myr¬</Unicode>\n",
83
+ " </TextEquiv>\n",
84
+ " </TextLine>\n",
85
+ " <TextLine id=\"line_1_9\" custom=\"readingOrder {index:9;}\">\n",
86
+ " <Coords points=\"1355,2871 1394,2898 2057,2893 2356,2941 2713,2892 4103,2896 4196,2882 4200,2832 4172,2797 3092,2789 2890,2751 2247,2787 2104,2754 1952,2767 1844,2729 1549,2766 1402,2755 1365,2792\"/>\n",
87
+ " <TextEquiv>\n",
88
+ " <Unicode>landta träkter uppfylldt land, som genom sin nå¬</Unicode>\n",
89
+ " </TextEquiv>\n",
90
+ " </TextLine>\n",
91
+ " <TextLine id=\"line_1_10\" custom=\"readingOrder {index:10;}\">\n",
92
+ " <Coords points=\"1326,2955 1326,3040 1395,3066 2064,3049 2684,3089 2847,3074 2991,3102 3621,3053 3899,3104 4152,3060 4190,3015 4160,2948 2534,2954 2153,2910 1771,2929 1445,2900\"/>\n",
93
+ " <TextEquiv>\n",
94
+ " <Unicode>turskonhet utmärker sig framför provinsens öfrige</Unicode>\n",
95
+ " </TextEquiv>\n",
96
+ " </TextLine>\n",
97
+ " <TextLine id=\"line_1_11\" custom=\"readingOrder {index:11;}\">\n",
98
+ " <Coords points=\"1357,3150 1364,3203 1402,3221 2659,3208 3051,3281 3254,3219 4148,3213 4196,3192 4204,3132 4148,3101 3814,3088 2952,3107 2797,3076 2635,3095 2413,3071 2200,3108 2038,3064 1855,3099 1400,3097\"/>\n",
99
+ " <TextEquiv>\n",
100
+ " <Unicode>socknar. Dess största längd från hon till söder</Unicode>\n",
101
+ " </TextEquiv>\n",
102
+ " </TextLine>\n",
103
+ " <TextLine id=\"line_1_12\" custom=\"readingOrder {index:12;}\">\n",
104
+ " <Coords points=\"3801,3315 3796,3275 3705,3237 3573,3255 3348,3240 2991,3277 2806,3228 2405,3264 1643,3232 1397,3263 1363,3328 1397,3370 1656,3389 1909,3369 2113,3433 2307,3368 3405,3381 3714,3364 3796,3340\"/>\n",
105
+ " <TextEquiv>\n",
106
+ " <Unicode>år 25/6 mil och från vester till Öster 3 2/3 mil.</Unicode>\n",
107
+ " </TextEquiv>\n",
108
+ " </TextLine>\n",
109
+ " <TextLine id=\"line_1_13\" custom=\"readingOrder {index:13;}\">\n",
110
+ " <Coords points=\"4176,3424 4152,3376 3675,3420 3252,3392 2959,3439 2699,3415 2498,3440 1950,3410 1740,3430 1543,3399 1398,3413 1365,3488 1402,3542 3254,3521 3452,3591 3663,3525 4140,3517\"/>\n",
111
+ " <TextEquiv>\n",
112
+ " <Unicode>Man räknar, inom denna socken öfver 300 stor</Unicode>\n",
113
+ " </TextEquiv>\n",
114
+ " </TextLine>\n",
115
+ " <TextLine id=\"line_1_14\" custom=\"readingOrder {index:14;}\">\n",
116
+ " <Coords points=\"4208,3677 4181,3593 3999,3539 3809,3567 3543,3549 3347,3589 3003,3540 2661,3587 1937,3583 1728,3555 1397,3581 1361,3624 1369,3677 1401,3694 2654,3683 2895,3723 3096,3687 4109,3698\"/>\n",
117
+ " <TextEquiv>\n",
118
+ " <Unicode>re och mindre sjöar och tjernar, hvilka tillsam¬</Unicode>\n",
119
+ " </TextEquiv>\n",
120
+ " </TextLine>\n",
121
+ " <TextLine id=\"line_1_15\" custom=\"readingOrder {index:15;}\">\n",
122
+ " <Coords points=\"1356,3828 1401,3853 2235,3839 2433,3880 2659,3841 2876,3895 3139,3843 3503,3842 3700,3880 4132,3839 4159,3774 4126,3741 3752,3714 3420,3740 1987,3719 1397,3737 1361,3764\"/>\n",
123
+ " <TextEquiv>\n",
124
+ " <Unicode>utans med vattendragen intaga den ansenliga area</Unicode>\n",
125
+ " </TextEquiv>\n",
126
+ " </TextLine>\n",
127
+ " <TextLine id=\"line_1_16\" custom=\"readingOrder {index:16;}\">\n",
128
+ " <Coords points=\"2972,3931 2945,3913 2850,3906 2657,3858 2547,3892 2106,3914 1850,3890 1557,3890 1446,3864 1376,3889 1364,3945 1386,4009 1558,4071 1602,4072 1797,4008 1933,3997 2137,4029 2253,3999 2449,3998 2650,4065 2753,4021 2953,3985\"/>\n",
129
+ " <TextEquiv>\n",
130
+ " <Unicode>len af 69093, i qvädratrefvar</Unicode>\n",
131
+ " </TextEquiv>\n",
132
+ " </TextLine>\n",
133
+ " <TextLine id=\"line_1_17\" custom=\"readingOrder {index:17;}\">\n",
134
+ " <Coords points=\"3383,3930 3382,3976 3401,3995 3899,3990 4008,4024 4177,4000 4210,3978 4198,3909 4182,3896 3658,3888 3600,3834 3478,3856 3403,3892\"/>\n",
135
+ " <TextEquiv>\n",
136
+ " <Unicode>På norra grån</Unicode>\n",
137
+ " </TextEquiv>\n",
138
+ " </TextLine>\n",
139
+ " <TextLine id=\"line_1_18\" custom=\"readingOrder {index:18;}\">\n",
140
+ " <Coords points=\"4138,4116 4104,4047 3904,4009 3620,4050 3449,4018 3148,4050 2947,4014 2719,4064 1951,4011 1612,4061 1397,4056 1362,4132 1397,4177 2790,4166 2878,4181 2954,4234 3083,4173 3238,4152 3750,4165 3903,4222 4107,4162\"/>\n",
141
+ " <TextEquiv>\n",
142
+ " <Unicode>sen och till hälften inom Skogs socken är Lingbo</Unicode>\n",
143
+ " </TextEquiv>\n",
144
+ " </TextLine>\n",
145
+ " <TextLine id=\"line_1_19\" custom=\"readingOrder {index:19;}\">\n",
146
+ " <Coords points=\"4115,4231 4082,4210 3846,4224 3401,4198 3254,4164 2990,4226 2255,4186 2091,4212 1704,4180 1396,4209 1363,4327 1396,4365 1561,4318 3347,4316 3544,4347 4088,4319\"/>\n",
147
+ " <TextEquiv>\n",
148
+ " <Unicode>sjön belägen, denna sammanbindes genom ett</Unicode>\n",
149
+ " </TextEquiv>\n",
150
+ " </TextLine>\n",
151
+ " <TextLine id=\"line_1_20\" custom=\"readingOrder {index:20;}\">\n",
152
+ " <Coords points=\"4193,4413 4149,4369 3500,4356 3355,4315 3150,4350 1800,4339 1400,4382 1372,4404 1366,4477 1404,4498 1646,4469 3143,4470 3449,4510 3678,4469 4152,4468\"/>\n",
153
+ " <TextEquiv>\n",
154
+ " <Unicode>mält sund med den vidsträckta, sydost derom</Unicode>\n",
155
+ " </TextEquiv>\n",
156
+ " </TextLine>\n",
157
+ " <TextLine id=\"line_1_21\" custom=\"readingOrder {index:21;}\">\n",
158
+ " <Coords points=\"1326,4630 1707,4642 1901,4684 2138,4629 3109,4631 3266,4684 3395,4634 4150,4638 4186,4621 4196,4570 4177,4532 4046,4491 3848,4534 3547,4536 2844,4482 2542,4532 2292,4467 2108,4530 1402,4505 1364,4531 1353,4614\"/>\n",
159
+ " <TextEquiv>\n",
160
+ " <Unicode>belägna sjön Ekaren. Vid Östra gränsen utbreda</Unicode>\n",
161
+ " </TextEquiv>\n",
162
+ " </TextLine>\n",
163
+ " <TextLine id=\"line_1_22\" custom=\"readingOrder {index:22;}\">\n",
164
+ " <Coords points=\"1364,4784 1399,4829 2365,4780 2749,4837 3162,4774 3396,4839 3592,4791 4118,4791 4150,4776 4163,4674 4103,4650 3976,4683 3659,4693 3386,4630 3329,4677 3247,4694 2740,4689 2502,4625 2350,4677 2043,4650 1840,4690 1552,4645 1392,4689\"/>\n",
165
+ " <TextEquiv>\n",
166
+ " <Unicode>sig Lilla och Stora Daninsjöarne, Tolfvorn samt</Unicode>\n",
167
+ " </TextEquiv>\n",
168
+ " </TextLine>\n",
169
+ " <TextLine id=\"line_1_23\" custom=\"readingOrder {index:23;}\">\n",
170
+ " <Coords points=\"4197,4828 3987,4848 3806,4812 3639,4849 3341,4848 3104,4781 2853,4851 2364,4838 2228,4795 2041,4840 1757,4812 1392,4847 1372,4927 1400,4957 3153,4946 3445,5014 3601,4963 3776,5012 3951,4964 4172,4948\"/>\n",
171
+ " <TextEquiv>\n",
172
+ " <Unicode>den till en liten del inom Hansränge liggande</Unicode>\n",
173
+ " </TextEquiv>\n",
174
+ " </TextLine>\n",
175
+ " <TextLine id=\"line_1_24\" custom=\"readingOrder {index:24;}\">\n",
176
+ " <Coords points=\"1361,5070 1397,5112 1645,5135 2054,5098 2944,5112 3104,5146 3309,5113 4049,5114 4145,5087 4150,5029 4054,4977 3857,5005 3696,4980 3387,5006 3104,4978 2765,5004 2400,4993 2202,4953 2050,4995 1895,4999 1483,4962 1380,4985\"/>\n",
177
+ " <TextEquiv>\n",
178
+ " <Unicode>Vittersjön. På gränsen emot Ofvansjö bemärkas</Unicode>\n",
179
+ " </TextEquiv>\n",
180
+ " </TextLine>\n",
181
+ " <TextLine id=\"line_1_25\" custom=\"readingOrder {index:25;}\">\n",
182
+ " <Coords points=\"4198,5132 3632,5162 3357,5126 3056,5145 2896,5112 2680,5166 2305,5115 1806,5151 1545,5121 1390,5158 1367,5233 1397,5266 1653,5281 2036,5261 4152,5271 4195,5254\"/>\n",
183
+ " <TextEquiv>\n",
184
+ " <Unicode>Hällsjön och Långsjön, hvilka till någon del lig¬</Unicode>\n",
185
+ " </TextEquiv>\n",
186
+ " </TextLine>\n",
187
+ " </TextRegion>\n",
188
+ " <TextRegion id=\"region_2\" custom=\"readingOrder {index:2;}\">\n",
189
+ " <Coords points=\"7922,724 7937,736 8051,707 8101,688 8148,659 8162,625 8159,613 8148,602 8106,597 7934,593 7923,597 7909,611 7905,624\"/>\n",
190
+ " <TextLine id=\"line_2_0\" custom=\"readingOrder {index:0;}\">\n",
191
+ " <Coords points=\"7905,613 7905,710 7922,717 7929,716 7932,723 7942,727 7946,733 7982,727 8022,714 8048,716 8100,701 8108,677 8113,671 8119,639 8127,620 8117,621 8103,615 8033,610 8006,602 7974,606 7955,602 7942,609 7940,616 7923,619\"/>\n",
192
+ " <TextEquiv>\n",
193
+ " <Unicode>297.</Unicode>\n",
194
+ " </TextEquiv>\n",
195
+ " </TextLine>\n",
196
+ " </TextRegion>\n",
197
+ " <TextRegion id=\"region_3\" custom=\"readingOrder {index:3;}\">\n",
198
+ " <Coords points=\"7926,824 5441,851 5062,908 5054,4678 5110,5261 7990,5223 7998,2514\"/>\n",
199
+ " <TextLine id=\"line_3_0\" custom=\"readingOrder {index:0;}\">\n",
200
+ " <Coords points=\"7934,902 7909,860 5383,870 5110,884 5082,908 5079,974 5116,1025 5449,983 7898,966\"/>\n",
201
+ " <TextEquiv>\n",
202
+ " <Unicode>ga inom sistnämnde socken, samt i vestra delen</Unicode>\n",
203
+ " </TextEquiv>\n",
204
+ " </TextLine>\n",
205
+ " <TextLine id=\"line_3_1\" custom=\"readingOrder {index:1;}\">\n",
206
+ " <Coords points=\"7871,1070 7842,1027 7279,1016 7131,993 6899,1026 6585,1000 6302,1032 5336,1013 5104,1046 5077,1130 5111,1161 5288,1190 5726,1138 6996,1121 7230,1133 7413,1180 7844,1132\"/>\n",
207
+ " <TextEquiv>\n",
208
+ " <Unicode>af Ugglevo eller den så kallade Finnbyggden,</Unicode>\n",
209
+ " </TextEquiv>\n",
210
+ " </TextLine>\n",
211
+ " <TextLine id=\"line_3_2\" custom=\"readingOrder {index:2;}\">\n",
212
+ " <Coords points=\"5067,1240 5078,1330 5122,1345 5395,1295 7111,1286 7351,1322 7553,1287 7898,1284 7929,1232 7899,1180 7090,1157 6692,1189 6400,1131 6109,1188 5900,1176 5727,1132 5515,1194 5124,1178\"/>\n",
213
+ " <TextEquiv>\n",
214
+ " <Unicode>Sjöarne Quidsjön, Holmsjön och Macksjon såsom</Unicode>\n",
215
+ " </TextEquiv>\n",
216
+ " </TextLine>\n",
217
+ " <TextLine id=\"line_3_3\" custom=\"readingOrder {index:3;}\">\n",
218
+ " <Coords points=\"5072,1459 7920,1440 7942,1401 7897,1346 7683,1319 6062,1340 5839,1299 5679,1337 5387,1312 5107,1350 5078,1376\"/>\n",
219
+ " <TextEquiv>\n",
220
+ " <Unicode>de största. Från nordvestra hörnet, der rå skillna¬</Unicode>\n",
221
+ " </TextEquiv>\n",
222
+ " </TextLine>\n",
223
+ " <TextLine id=\"line_3_4\" custom=\"readingOrder {index:4;}\">\n",
224
+ " <Coords points=\"7885,1559 7852,1499 7620,1467 7163,1487 6896,1460 6342,1506 5123,1514 5079,1578 5114,1632 6255,1614 6458,1638 6686,1613 7866,1599\"/>\n",
225
+ " <TextEquiv>\n",
226
+ " <Unicode>derna mellan denna samt Hanebo och Ballnäs</Unicode>\n",
227
+ " </TextEquiv>\n",
228
+ " </TextLine>\n",
229
+ " <TextLine id=\"line_3_5\" custom=\"readingOrder {index:5;}\">\n",
230
+ " <Coords points=\"7919,1677 7898,1622 7577,1655 7223,1654 7015,1618 6735,1653 6338,1636 6037,1667 5337,1647 5113,1662 5076,1747 5112,1783 6179,1779 6399,1810 6676,1763 7888,1761\"/>\n",
231
+ " <TextEquiv>\n",
232
+ " <Unicode>socknar sammanträffa med hvarandra och till</Unicode>\n",
233
+ " </TextEquiv>\n",
234
+ " </TextLine>\n",
235
+ " <TextLine id=\"line_3_6\" custom=\"readingOrder {index:6;}\">\n",
236
+ " <Coords points=\"7915,1800 7898,1780 7564,1795 7399,1775 7097,1808 6173,1811 6012,1782 5096,1817 5078,1905 5111,1943 5229,1948 5341,1990 5574,1939 6276,1918 7854,1936 7904,1911\"/>\n",
237
+ " <TextEquiv>\n",
238
+ " <Unicode>det sydvestra hörnet, der Wahlbo och Hille tillstöta</Unicode>\n",
239
+ " </TextEquiv>\n",
240
+ " </TextLine>\n",
241
+ " <TextLine id=\"line_3_7\" custom=\"readingOrder {index:7;}\">\n",
242
+ " <Coords points=\"7928,2016 7880,1975 7630,1951 5819,1981 5558,1958 5389,1988 5117,1984 5081,2074 5115,2120 5383,2092 6047,2092 6295,2147 6627,2069 6789,2135 7052,2134 7338,2076 7898,2097\"/>\n",
243
+ " <TextEquiv>\n",
244
+ " <Unicode>genomskäres socknen af dess betydligaste vattendrag,</Unicode>\n",
245
+ " </TextEquiv>\n",
246
+ " </TextLine>\n",
247
+ " <TextLine id=\"line_3_8\" custom=\"readingOrder {index:8;}\">\n",
248
+ " <Coords points=\"7863,2134 7837,2114 7635,2142 6862,2147 6623,2079 6349,2146 5175,2117 5095,2137 5077,2191 5120,2260 6183,2241 6395,2270 6659,2238 7076,2266 7817,2242 7849,2221\"/>\n",
249
+ " <TextEquiv>\n",
250
+ " <Unicode>hvilket under namn af Fansan upprinner uti</Unicode>\n",
251
+ " </TextEquiv>\n",
252
+ " </TextLine>\n",
253
+ " <TextLine id=\"line_3_9\" custom=\"readingOrder {index:9;}\">\n",
254
+ " <Coords points=\"5085,2414 5115,2458 5282,2413 5895,2396 6124,2416 6288,2390 7346,2397 7510,2431 7853,2390 7894,2347 7856,2269 7683,2247 7525,2281 7286,2247 7032,2306 6791,2294 6618,2253 5440,2307 5225,2243 5103,2293\"/>\n",
255
+ " <TextEquiv>\n",
256
+ " <Unicode>Grannäs sjön i Alfta socken inom Helsingland,</Unicode>\n",
257
+ " </TextEquiv>\n",
258
+ " </TextLine>\n",
259
+ " <TextLine id=\"line_3_10\" custom=\"readingOrder {index:10;}\">\n",
260
+ " <Coords points=\"5082,2576 5125,2602 5241,2567 5470,2571 5626,2635 5794,2564 6313,2544 7910,2549 7933,2493 7899,2437 7398,2402 7120,2443 6837,2425 6563,2451 6282,2394 6064,2444 5561,2421 5287,2467 5119,2465 5090,2487\"/>\n",
261
+ " <TextEquiv>\n",
262
+ " <Unicode>gar vidare genom Bollnäs socken, samt sedan det</Unicode>\n",
263
+ " </TextEquiv>\n",
264
+ " </TextLine>\n",
265
+ " <TextLine id=\"line_3_11\" custom=\"readingOrder {index:11;}\">\n",
266
+ " <Coords points=\"5092,2691 5117,2728 6344,2705 6559,2773 6847,2715 7119,2709 7341,2749 7515,2710 7683,2763 7932,2716 7962,2690 7965,2629 7918,2584 7735,2571 7503,2607 7245,2556 7007,2602 6692,2606 6448,2546 6246,2597 5788,2575 5615,2622 5401,2583 5116,2617\"/>\n",
267
+ " <TextEquiv>\n",
268
+ " <Unicode>derstädes utbredt sig i Fanssjöarne intränger i Uggle¬</Unicode>\n",
269
+ " </TextEquiv>\n",
270
+ " </TextLine>\n",
271
+ " <TextLine id=\"line_3_12\" custom=\"readingOrder {index:12;}\">\n",
272
+ " <Coords points=\"5088,2854 5284,2949 5565,2865 5688,2877 5837,2935 6063,2873 6291,2929 6398,2882 6544,2865 6779,2873 7007,2930 7119,2895 7900,2869 7937,2851 7944,2798 7899,2764 7512,2712 7170,2762 6512,2768 6212,2710 6026,2753 5853,2718 5631,2755 5222,2734 5106,2762\"/>\n",
273
+ " <TextEquiv>\n",
274
+ " <Unicode>bo, efter att hafva vid gränsen deraf, upptagit den</Unicode>\n",
275
+ " </TextEquiv>\n",
276
+ " </TextLine>\n",
277
+ " <TextLine id=\"line_3_13\" custom=\"readingOrder {index:13;}\">\n",
278
+ " <Coords points=\"7948,2959 7937,2913 7674,2920 7517,2883 7344,2923 6897,2925 6712,2894 6522,2929 6178,2906 5961,2937 5543,2886 5341,2936 5118,2915 5085,2987 5116,3066 5350,3035 5736,3083 6057,3027 7039,3026 7289,3043 7454,3091 7909,3019\"/>\n",
279
+ " <TextEquiv>\n",
280
+ " <Unicode>från Nybosjon i Bollnäs, kommande Rymje el¬</Unicode>\n",
281
+ " </TextEquiv>\n",
282
+ " </TextLine>\n",
283
+ " <TextLine id=\"line_3_14\" custom=\"readingOrder {index:14;}\">\n",
284
+ " <Coords points=\"5084,3130 5088,3191 5120,3212 5820,3183 7463,3178 7676,3258 7953,3192 7954,3071 7784,3057 7469,3087 6514,3056 6263,3083 6072,3045 5850,3091 5463,3058 5111,3084\"/>\n",
285
+ " <TextEquiv>\n",
286
+ " <Unicode>ler Svartån, Så snart det inkommit i Uggle¬</Unicode>\n",
287
+ " </TextEquiv>\n",
288
+ " </TextLine>\n",
289
+ " <TextLine id=\"line_3_15\" custom=\"readingOrder {index:15;}\">\n",
290
+ " <Coords points=\"5086,3308 5120,3364 6291,3341 6510,3383 6618,3350 6980,3341 7236,3372 7416,3345 7901,3341 7952,3242 7515,3242 7291,3191 7091,3247 6671,3250 6347,3206 6115,3233 5833,3191 5691,3222 5122,3223\"/>\n",
291
+ " <TextEquiv>\n",
292
+ " <Unicode>bo socken bildar det några smärre tjernar, samt</Unicode>\n",
293
+ " </TextEquiv>\n",
294
+ " </TextLine>\n",
295
+ " <TextLine id=\"line_3_16\" custom=\"readingOrder {index:16;}\">\n",
296
+ " <Coords points=\"7931,3412 7899,3366 7244,3399 6894,3360 6509,3388 6182,3370 5905,3404 5222,3373 5107,3391 5081,3508 5122,3563 5180,3527 5336,3511 5538,3524 5623,3563 5786,3511 6129,3506 6509,3565 6623,3517 7010,3512 7129,3555 7341,3501 7459,3507 7574,3555 7898,3493\"/>\n",
297
+ " <TextEquiv>\n",
298
+ " <Unicode>flyter derefter med obetydliga krökningar i sydost¬</Unicode>\n",
299
+ " </TextEquiv>\n",
300
+ " </TextLine>\n",
301
+ " <TextLine id=\"line_3_17\" custom=\"readingOrder {index:17;}\">\n",
302
+ " <Coords points=\"7945,3622 7902,3546 7587,3539 7460,3502 7232,3546 6467,3567 6175,3524 5796,3578 5522,3539 5286,3554 5223,3522 5115,3562 5082,3649 5121,3733 5346,3676 5623,3698 6064,3679 6183,3711 6337,3671 6572,3689 6675,3729 6789,3675 6952,3652 7349,3687 7576,3672 7740,3704 7915,3665\"/>\n",
303
+ " <TextEquiv>\n",
304
+ " <Unicode>lig riktning, upptager ofvanför Amots by, ett, från</Unicode>\n",
305
+ " </TextEquiv>\n",
306
+ " </TextLine>\n",
307
+ " <TextLine id=\"line_3_18\" custom=\"readingOrder {index:18;}\">\n",
308
+ " <Coords points=\"7939,3742 7905,3703 7797,3689 7174,3708 6897,3673 6736,3711 6343,3683 6173,3714 5778,3722 5505,3686 5123,3733 5090,3806 5116,3843 5350,3841 5505,3882 5679,3842 6512,3816 6974,3863 7899,3814 7930,3797\"/>\n",
309
+ " <TextEquiv>\n",
310
+ " <Unicode>de vid gränsen emot Bollnäs belägne tjernar kom¬</Unicode>\n",
311
+ " </TextEquiv>\n",
312
+ " </TextLine>\n",
313
+ " <TextLine id=\"line_3_19\" custom=\"readingOrder {index:19;}\">\n",
314
+ " <Coords points=\"7881,3892 7851,3843 7637,3870 5119,3890 5080,3971 5115,4008 6343,3977 6574,4039 6810,3984 7843,3970\"/>\n",
315
+ " <TextEquiv>\n",
316
+ " <Unicode>mande mindre vattendrag, samt vidare vid</Unicode>\n",
317
+ " </TextEquiv>\n",
318
+ " </TextLine>\n",
319
+ " <TextLine id=\"line_3_20\" custom=\"readingOrder {index:20;}\">\n",
320
+ " <Coords points=\"5109,4130 5179,4151 6633,4137 6850,4201 7053,4136 7886,4122 7929,4073 7900,4018 6933,4016 6446,4042 6119,4006 5509,4002 5142,4036 5114,4058\"/>\n",
321
+ " <TextEquiv>\n",
322
+ " <Unicode>Amots bruk Kölsjö än, som lopande söder om</Unicode>\n",
323
+ " </TextEquiv>\n",
324
+ " </TextLine>\n",
325
+ " <TextLine id=\"line_3_21\" custom=\"readingOrder {index:21;}\">\n",
326
+ " <Coords points=\"7904,4191 7475,4164 7226,4186 7008,4156 6853,4194 6506,4169 6276,4198 5835,4163 5450,4201 5281,4163 5117,4188 5093,4298 5124,4320 7788,4289 7889,4271\"/>\n",
327
+ " <TextEquiv>\n",
328
+ " <Unicode>Fans- och Svartåarne har sin kalla i den inom</Unicode>\n",
329
+ " </TextEquiv>\n",
330
+ " </TextLine>\n",
331
+ " <TextLine id=\"line_3_22\" custom=\"readingOrder {index:22;}\">\n",
332
+ " <Coords points=\"7921,4358 7887,4330 7671,4336 7511,4301 7126,4348 6792,4315 6517,4359 6240,4309 5962,4348 5234,4326 5115,4350 5097,4415 5120,4467 5989,4453 6239,4484 6571,4450 6846,4475 7843,4451 7911,4424\"/>\n",
333
+ " <TextEquiv>\n",
334
+ " <Unicode>Ballnäs socken belägna Kölsjön. Förenade</Unicode>\n",
335
+ " </TextEquiv>\n",
336
+ " </TextLine>\n",
337
+ " <TextLine id=\"line_3_23\" custom=\"readingOrder {index:23;}\">\n",
338
+ " <Coords points=\"5136,4640 5174,4675 5237,4641 5397,4624 7060,4608 7406,4657 7619,4608 7905,4598 7939,4519 7906,4481 6605,4506 5284,4469 5152,4520\"/>\n",
339
+ " <TextEquiv>\n",
340
+ " <Unicode>fortsätta nu dessa vatten under hamn af Amots</Unicode>\n",
341
+ " </TextEquiv>\n",
342
+ " </TextLine>\n",
343
+ " <TextLine id=\"line_3_24\" custom=\"readingOrder {index:24;}\">\n",
344
+ " <Coords points=\"7959,4659 7922,4613 7736,4603 7516,4657 7006,4612 6749,4668 6479,4609 6226,4666 5500,4648 5126,4703 5106,4750 5160,4787 5889,4802 6291,4769 6505,4820 6734,4768 7053,4761 7294,4827 7522,4763 7844,4799 7955,4751\"/>\n",
345
+ " <TextEquiv>\n",
346
+ " <Unicode>än sitt lopp genom Påls jon och Wallsjön till By¬</Unicode>\n",
347
+ " </TextEquiv>\n",
348
+ " </TextLine>\n",
349
+ " <TextLine id=\"line_3_25\" custom=\"readingOrder {index:25;}\">\n",
350
+ " <Coords points=\"7972,4853 7952,4823 7633,4779 7245,4819 6847,4787 6564,4831 5667,4805 5505,4840 5178,4845 5147,4922 5170,4970 5504,4920 6632,4960 7111,4928 7961,4912\"/>\n",
351
+ " <TextEquiv>\n",
352
+ " <Unicode>sjön. Sistnämnde sjö upphemtar dessutom norr</Unicode>\n",
353
+ " </TextEquiv>\n",
354
+ " </TextLine>\n",
355
+ " <TextLine id=\"line_3_26\" custom=\"readingOrder {index:26;}\">\n",
356
+ " <Coords points=\"5125,5078 5170,5121 5558,5094 6176,5086 6416,5114 6720,5091 6900,5143 7019,5130 7125,5154 7340,5070 7886,5070 7947,5020 7903,4966 5995,4995 5776,4955 5570,4987 5174,4992 5131,5019\"/>\n",
357
+ " <TextEquiv>\n",
358
+ " <Unicode>ifrån Moan, som upprinner på gränsen emel¬</Unicode>\n",
359
+ " </TextEquiv>\n",
360
+ " </TextLine>\n",
361
+ " <TextLine id=\"line_3_27\" custom=\"readingOrder {index:27;}\">\n",
362
+ " <Coords points=\"7953,5184 7924,5144 7735,5095 7508,5116 7342,5083 7185,5141 7006,5148 6685,5115 6398,5144 6182,5110 5728,5145 5548,5113 5396,5149 5151,5147 5110,5200 5118,5274 5185,5274 5187,5249 7870,5239 7934,5227\"/>\n",
363
+ " <TextEquiv>\n",
364
+ " <Unicode>lan Skogs och Hauebo socknar, samt emottager</Unicode>\n",
365
+ " </TextEquiv>\n",
366
+ " </TextLine>\n",
367
+ " </TextRegion>\n",
368
+ " </Page>\n",
369
+ "</PcGts>\n",
370
+ "\n"
371
+ ]
372
+ }
373
+ ],
374
+ "source": [
375
+ "from gradio_client import Client\n",
376
+ "\n",
377
+ "client = Client(\"http://127.0.0.1:7860/\")\n",
378
+ "job = client.submit(\n",
379
+ " \"./helper/examples/images/1861_R0000277_00153.jpg\", # str (filepath or URL to image) in 'Image to run HTR-pipeline on' Image component\n",
380
+ " \"test_api\", # str in 'parameter_22' Textbox component\n",
381
+ " api_name=\"/predict\",\n",
382
+ ")\n",
383
+ "\n",
384
+ "print(job.result())\n"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 9,
390
+ "metadata": {},
391
+ "outputs": [
392
+ {
393
+ "name": "stdout",
394
+ "output_type": "stream",
395
+ "text": [
396
+ "Running on local URL: http://127.0.0.1:7861\n",
397
+ "\n",
398
+ "To create a public link, set `share=True` in `launch()`.\n"
399
+ ]
400
+ },
401
+ {
402
+ "data": {
403
+ "text/html": [
404
+ "<div><iframe src=\"http://127.0.0.1:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
405
+ ],
406
+ "text/plain": [
407
+ "<IPython.core.display.HTML object>"
408
+ ]
409
+ },
410
+ "metadata": {},
411
+ "output_type": "display_data"
412
+ },
413
+ {
414
+ "data": {
415
+ "text/plain": []
416
+ },
417
+ "execution_count": 9,
418
+ "metadata": {},
419
+ "output_type": "execute_result"
420
+ }
421
+ ],
422
+ "source": [
423
+ "import gradio as gr\n",
424
+ "\n",
425
+ "def create_object(arg):\n",
426
+ " return gr.Textbox(value=arg)\n",
427
+ "\n",
428
+ "my_objects = {}\n",
429
+ "\n",
430
+ "test_list =[\"first\", \"second\"] \n",
431
+ "for i in test_list:\n",
432
+ " object_name = f\"object_{i}\"\n",
433
+ " new_object = create_object(i)\n",
434
+ " my_objects[object_name] = new_object\n",
435
+ "\n",
436
+ "# Accessing objects by their assigned names\n",
437
+ "first_object = my_objects[\"object_first\"]\n",
438
+ "second_object = my_objects[\"object_second\"]\n",
439
+ "\n",
440
+ "with gr.Blocks() as test:\n",
441
+ " with gr.Row():\n",
442
+ " first_object.render()\n",
443
+ " with gr.Row():\n",
444
+ " second_object.render()\n",
445
+ "\n",
446
+ "test.launch()\n"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "metadata": {},
453
+ "outputs": [],
454
+ "source": []
455
+ }
456
+ ],
457
+ "metadata": {
458
+ "kernelspec": {
459
+ "display_name": "venv",
460
+ "language": "python",
461
+ "name": "python3"
462
+ },
463
+ "language_info": {
464
+ "codemirror_mode": {
465
+ "name": "ipython",
466
+ "version": 3
467
+ },
468
+ "file_extension": ".py",
469
+ "mimetype": "text/x-python",
470
+ "name": "python",
471
+ "nbconvert_exporter": "python",
472
+ "pygments_lexer": "ipython3",
473
+ "version": "3.10.9"
474
+ },
475
+ "orig_nbformat": 4
476
+ },
477
+ "nbformat": 4,
478
+ "nbformat_minor": 2
479
+ }