Gabriel committed on
Commit
417b347
·
1 Parent(s): 27476a6

added new dataset

Browse files
.gitignore CHANGED
@@ -21,3 +21,5 @@ src/htr_pipeline.egg-info/
21
  page_xml.xml
22
  page_txt.txt
23
  transcribed_text.txt
 
 
 
21
  page_xml.xml
22
  page_txt.txt
23
  transcribed_text.txt
24
+ helper/examples/.cache_images/
25
+ helper/examples/images/*.jpg
app.py CHANGED
@@ -1,17 +1,17 @@
 
 
 
1
  import gradio as gr
2
 
3
- from helper.examples.examples import ExamplesImages
4
  from helper.gradio_config import css, js, theme
5
- from helper.text.text_about import TextAbout
6
- from helper.text.text_app import TextApp
7
- from helper.text.text_howto import TextHowTo
8
- from helper.text.text_riksarkivet import TextRiksarkivet
9
- from helper.text.text_roadmap import TextRoadmap
10
  from src.htr_pipeline.gradio_backend import CustomTrack, FastTrack, SingletonModelLoader
11
 
12
  model_loader = SingletonModelLoader()
13
  fast_track = FastTrack(model_loader)
14
  custom_track = CustomTrack(model_loader)
 
15
 
16
  with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
17
  gr.Markdown(" ")
@@ -36,15 +36,17 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
36
  # visible=True,
37
  # ).style(full_width=True)
38
  radio_file_input = gr.Radio(
39
- value="Text file", choices=["Text file", "Page XML"], label="What kind file output?"
40
  )
 
 
41
 
42
  htr_pipeline_button = gr.Button(
43
  "Run HTR",
44
  variant="primary",
45
  visible=True,
46
  elem_id="run_pipeline_button",
47
- ).style(full_width=False)
48
 
49
  with gr.Group():
50
  with gr.Row():
@@ -54,8 +56,8 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
54
  fast_name_files_placeholder = gr.Markdown(visible=False)
55
 
56
  gr.Examples(
57
- examples=ExamplesImages.example_images_with_info,
58
- inputs=[fast_track_input_region_image, fast_name_files_placeholder],
59
  label="Example images",
60
  examples_per_page=3,
61
  )
@@ -82,6 +84,7 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
82
  with gr.Tab("1. Region Segmentation"):
83
  with gr.Row():
84
  with gr.Column(scale=2):
 
85
  name_files_placeholder = gr.Markdown(visible=False)
86
 
87
  with gr.Row():
@@ -132,8 +135,8 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
132
  with gr.Row():
133
  with gr.Accordion("Example images to use:", open=False) as example_accord:
134
  gr.Examples(
135
- examples=ExamplesImages.example_images_with_info,
136
- inputs=[input_region_image, name_files_placeholder],
137
  label="Example images",
138
  examples_per_page=2,
139
  )
@@ -161,7 +164,7 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
161
  columns=[2],
162
  rows=[2],
163
  # object_fit="contain",
164
- height=300,
165
  preview=True,
166
  container=False,
167
  )
@@ -474,8 +477,14 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
474
  outputs=[txt_file_downlod, txt_file_downlod],
475
  )
476
 
 
 
 
 
 
477
  clear_button.click(
478
  lambda: (
 
479
  None,
480
  None,
481
  None,
@@ -494,6 +503,7 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
494
  ),
495
  inputs=[],
496
  outputs=[
 
497
  input_region_image,
498
  regions_cropped_gallery,
499
  input_region_from_gallery,
@@ -520,3 +530,5 @@ demo.queue(concurrency_count=5, max_size=20)
520
 
521
  if __name__ == "__main__":
522
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
 
 
 
1
+ import os
2
+ import shutil
3
+
4
  import gradio as gr
5
 
6
+ from helper.examples.examples import DemoImages
7
  from helper.gradio_config import css, js, theme
8
+ from helper.text import TextAbout, TextApp, TextHowTo, TextRiksarkivet, TextRoadmap
 
 
 
 
9
  from src.htr_pipeline.gradio_backend import CustomTrack, FastTrack, SingletonModelLoader
10
 
11
  model_loader = SingletonModelLoader()
12
  fast_track = FastTrack(model_loader)
13
  custom_track = CustomTrack(model_loader)
14
+ images_for_demo = DemoImages()
15
 
16
  with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
17
  gr.Markdown(" ")
 
36
  # visible=True,
37
  # ).style(full_width=True)
38
  radio_file_input = gr.Radio(
39
+ value="Text file", choices=["Text file ", "Page XML file "], label="What kind file output?"
40
  )
41
+ with gr.Row():
42
+ htr_clear_button = gr.Button("", variant="Secondary")
43
 
44
  htr_pipeline_button = gr.Button(
45
  "Run HTR",
46
  variant="primary",
47
  visible=True,
48
  elem_id="run_pipeline_button",
49
+ ).style(full_width=True)
50
 
51
  with gr.Group():
52
  with gr.Row():
 
56
  fast_name_files_placeholder = gr.Markdown(visible=False)
57
 
58
  gr.Examples(
59
+ examples=images_for_demo.examples_list,
60
+ inputs=[fast_name_files_placeholder, fast_track_input_region_image],
61
  label="Example images",
62
  examples_per_page=3,
63
  )
 
84
  with gr.Tab("1. Region Segmentation"):
85
  with gr.Row():
86
  with gr.Column(scale=2):
87
+ vis_data_folder_placeholder = gr.Markdown(visible=False)
88
  name_files_placeholder = gr.Markdown(visible=False)
89
 
90
  with gr.Row():
 
135
  with gr.Row():
136
  with gr.Accordion("Example images to use:", open=False) as example_accord:
137
  gr.Examples(
138
+ examples=images_for_demo.examples_list,
139
+ inputs=[name_files_placeholder, input_region_image],
140
  label="Example images",
141
  examples_per_page=2,
142
  )
 
164
  columns=[2],
165
  rows=[2],
166
  # object_fit="contain",
167
+ height=400,
168
  preview=True,
169
  container=False,
170
  )
 
477
  outputs=[txt_file_downlod, txt_file_downlod],
478
  )
479
 
480
+ # def remove_temp_vis():
481
+ # if os.path.exists("./vis_data"):
482
+ # os.remove("././vis_data")
483
+ # return None
484
+
485
  clear_button.click(
486
  lambda: (
487
+ (shutil.rmtree("./vis_data") if os.path.exists("./vis_data") else None, None)[1],
488
  None,
489
  None,
490
  None,
 
503
  ),
504
  inputs=[],
505
  outputs=[
506
+ vis_data_folder_placeholder,
507
  input_region_image,
508
  regions_cropped_gallery,
509
  input_region_from_gallery,
 
530
 
531
  if __name__ == "__main__":
532
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
533
+ if __name__ == "__main__":
534
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
helper/examples/create_examples.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+
3
+ _CITATION = """\
4
+ @InProceedings{huggingface:dataset,
5
+ title = {Small htr examples images},
6
+ author={Gabriel Borg},
7
+ year={2023}
8
+ }
9
+ """
10
+
11
+ _DESCRIPTION = """\
12
+ Demo dataset for the htr demo.
13
+ """
14
+ _HOMEPAGE = "https://huggingface.co/datasets/Riksarkivet/test_images_demo"
15
+
16
+ _LICENSE = ""
17
+
18
+ _REPO = "https://huggingface.co/datasets/Riksarkivet/test_images_demo"
19
+
20
+
21
class ExampleImages(datasets.GeneratorBasedBuilder):
    """Dataset builder for a small set of demo images, each paired with one line of text."""

    def _info(self):
        """Return the dataset metadata: a string ``text`` column and an ``image`` column."""
        feature_spec = datasets.Features(
            {
                "text": datasets.Value("string"),
                "image": datasets.Image(),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=feature_spec,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the image archive and the text file; expose them as a single TRAIN split.

        The keys of ``gen_kwargs`` must match the parameter names of
        ``_generate_examples``, which receives them as keyword arguments.
        """
        archive = dl_manager.download(f"{_REPO}/resolve/main/images.tar.gz")
        captions_file = dl_manager.download(f"{_REPO}/resolve/main/images.txt")
        train_kwargs = {
            "images": dl_manager.iter_archive(archive),
            "metadata_path": captions_file,
        }
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=train_kwargs)]

    def _generate_examples(self, images, metadata_path):
        """Yield ``(index, example)`` pairs, one per archive member.

        ``images`` is iterated as ``(filepath, file_object)`` pairs and matched
        positionally with the newline-separated entries of ``metadata_path``:
        the n-th archive member gets the n-th line. Iteration stops as soon as
        either side runs out.
        """
        with open(metadata_path, encoding="utf-8") as handle:
            captions = handle.read().split("\n")

        for idx, ((filepath, image), caption) in enumerate(zip(images, captions)):
            yield idx, {
                "image": {"path": filepath, "bytes": image.read()},
                "text": caption,
            }
58
+
59
+
60
+ if __name__ == "__main__":
61
+ pass
helper/examples/examples.py CHANGED
@@ -1,20 +1,38 @@
1
- class ExamplesImages:
2
- image_path = "./helper/examples/images"
3
- example_images_with_info = [
4
- [f"{image_path}/1664-Handelskollegiet_A1_0014full.jpg", "1664 HandelsKollegiet"],
5
- [
6
- f"{image_path}/1735-Södra_förstadens_kämnärsrätt_00042-scan_2020-10-13_14-03-37.jpg",
7
- "1735 Södra förstadens kämnärsrätt",
8
- ],
9
- [f"{image_path}/1777-Hall-_och_Manufakturrätten_HallMan_Sida_03.jpg", "1777 Hall och Manufakturrätten"],
10
- [f"{image_path}/1840-1890_H0000304_00034.jpg", "1840-1890 --"],
11
- [f"{image_path}/1861_R0000277_00153.jpg", "1861 --"],
12
- [f"{image_path}/1664-Handelskollegiet_A1_0014full.jpg", "1664 HandelsKollegiet"],
13
- [
14
- f"{image_path}/1735-Södra_förstadens_kämnärsrätt_00042-scan_2020-10-13_14-03-37.jpg",
15
- "1735 Södra förstadens kämnärsrätt",
16
- ],
17
- [f"{image_path}/1777-Hall-_och_Manufakturrätten_HallMan_Sida_03.jpg", "1777 Hall och Manufakturrätten"],
18
- [f"{image_path}/1840-1890_H0000304_00034.jpg", "1840-1890 --"],
19
- [f"{image_path}/1861_R0000277_00153.jpg", "1861 --"],
20
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+
3
+ import datasets
4
+ from PIL import Image
5
+
6
+
7
class DemoImages:
    """Load the demo dataset and expose it as ``[description, image_path]`` rows.

    On construction the dataset is loaded, its ``train`` split is converted to
    a pandas DataFrame, and every image is written to ``image_dir`` as a JPEG
    so it can be referenced by file path.

    Args:
        url: Dataset identifier passed to ``datasets.load_dataset``.
        cache_dir: Directory passed to ``datasets.load_dataset`` as its cache.
        image_dir: Directory the extracted ``image_<index>.jpg`` files are
            written to. It is created if it does not exist.

    Attributes set by ``__init__``: ``image_dir``, ``images_datasets``,
    ``example_df`` and ``examples_list``.
    """

    def __init__(
        self,
        url="Riksarkivet/test_images_demo",
        cache_dir="./helper/examples/.cache_images",
        image_dir="./helper/examples/images",
    ) -> None:
        self.image_dir = image_dir
        self.images_datasets = datasets.load_dataset(url, cache_dir=cache_dir)
        self.example_df = self.images_datasets["train"].to_pandas()
        self.examples_list = self.convert_bytes_to_images()

    def convert_bytes_to_images(self):
        """Write every image in ``self.example_df`` to disk and return the example rows.

        Each row must provide ``row["image"]["bytes"]`` (encoded image bytes)
        and ``row["text"]`` (its description).

        Returns:
            A list of ``[description, path_to_image]`` lists, in row order.
        """
        # Function-scope import keeps this block self-contained; the module's
        # top-level imports do not include ``os``.
        import os

        # The target directory may be missing on a fresh checkout; without this,
        # ``image.save`` raises FileNotFoundError.
        os.makedirs(self.image_dir, exist_ok=True)

        examples_list = []
        for index, row in self.example_df.iterrows():
            path_to_image = f"{self.image_dir}/image_{index}.jpg"

            # The context manager closes the decoded source image after saving.
            with Image.open(io.BytesIO(row["image"]["bytes"])) as source:
                # Modes with alpha or a palette (RGBA, LA, P, ...) cannot be
                # written as JPEG, so convert them to RGB first.
                writable = source if source.mode in ("L", "RGB", "CMYK") else source.convert("RGB")
                writable.save(path_to_image)

            examples_list.append([row["text"], path_to_image])

        return examples_list
33
+
34
+
35
+ if __name__ == "__main__":
36
+ test = DemoImages(cache_dir=".cache_images")
37
+
38
+ print(test.examples_list)
helper/text/__init__.py CHANGED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from helper.text.text_about import TextAbout
2
+ from helper.text.text_app import TextApp
3
+ from helper.text.text_howto import TextHowTo
4
+ from helper.text.text_riksarkivet import TextRiksarkivet
5
+ from helper.text.text_roadmap import TextRoadmap
6
+
7
+ if __name__ == "__main__":
8
+ pass
helper/text/text_about.py CHANGED
@@ -70,3 +70,7 @@ class TextAbout:
70
  Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
71
 
72
  """
 
 
 
 
 
70
  Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
71
 
72
  """
73
+
74
+
75
+ if __name__ == "__main__":
76
+ pass
helper/text/text_app.py CHANGED
@@ -6,3 +6,7 @@ class TextApp:
6
  <h1><center> Handwritten Text Recognition Tool </center></h1>
7
 
8
  <h3><center> Swedish National Archives - Riksarkivet </center></h3>"""
 
 
 
 
 
6
  <h1><center> Handwritten Text Recognition Tool </center></h1>
7
 
8
  <h3><center> Swedish National Archives - Riksarkivet </center></h3>"""
9
+
10
+
11
+ if __name__ == "__main__":
12
+ pass
helper/text/text_howto.py CHANGED
@@ -92,3 +92,7 @@ To explore the HTR results, follow these steps:
92
  ## &nbsp;
93
  Alternatively, you can watch the instructional video below, which provides a step-by-step walkthrough of the HTR Tool and some additional features.
94
  """
 
 
 
 
 
92
  ## &nbsp;
93
  Alternatively, you can watch the instructional video below, which provides a step-by-step walkthrough of the HTR Tool and some additional features.
94
  """
95
+
96
+
97
+ if __name__ == "__main__":
98
+ pass
helper/text/text_riksarkivet.py CHANGED
@@ -8,3 +8,7 @@ class TextRiksarkivet:
8
  ## Contact us
9
  Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
10
  """
 
 
 
 
 
8
  ## Contact us
9
  Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
10
  """
11
+
12
+
13
+ if __name__ == "__main__":
14
+ pass
helper/text/text_roadmap.py CHANGED
@@ -15,3 +15,7 @@ class TextRoadmap:
15
 
16
  Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
17
  """
 
 
 
 
 
15
 
16
  Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
17
  """
18
+
19
+
20
+ if __name__ == "__main__":
21
+ pass
requirements.txt CHANGED
@@ -8,6 +8,7 @@ opencv-python-headless
8
  jinja2
9
  transformers
10
  huggingface_hub
 
11
  requests
12
  # scipy
13
  # sklearn
 
8
  jinja2
9
  transformers
10
  huggingface_hub
11
+ datasets
12
  requests
13
  # scipy
14
  # sklearn
src/htr_pipeline/models.py CHANGED
@@ -57,3 +57,7 @@ class HtrModels:
57
  }
58
 
59
  return config_path
 
 
 
 
 
57
  }
58
 
59
  return config_path
60
+
61
+
62
+ if __name__ == "__main__":
63
+ pass
src/htr_pipeline/utils/filter_segmask.py CHANGED
@@ -124,4 +124,7 @@ class FilterSegMask:
124
 
125
  new_filtered_result.pred_instances = new_pred_instances
126
  return new_filtered_result
127
- return new_filtered_result
 
 
 
 
124
 
125
  new_filtered_result.pred_instances = new_pred_instances
126
  return new_filtered_result
127
+
128
+
129
+ if __name__ == "__main__":
130
+ pass
src/htr_pipeline/utils/helper.py CHANGED
@@ -90,10 +90,3 @@ if __name__ == "__main__":
90
  kwargs={"spam": "eggs"},
91
  )
92
  print(retval)
93
-
94
- # Example of using the decorator
95
- retval = another_long_running_function()
96
- print(retval)
97
- retval = another_long_running_function()
98
- print(retval)
99
- print(retval)
 
90
  kwargs={"spam": "eggs"},
91
  )
92
  print(retval)
 
 
 
 
 
 
 
src/htr_pipeline/utils/order_of_object.py CHANGED
@@ -86,3 +86,7 @@ class OrderObject:
86
 
87
  # Return the ordered regions
88
  return df["region_id"].tolist()
 
 
 
 
 
86
 
87
  # Return the ordered regions
88
  return df["region_id"].tolist()
89
+
90
+
91
+ if __name__ == "__main__":
92
+ pass
src/htr_pipeline/utils/parser_xml.py CHANGED
@@ -74,3 +74,7 @@ class XmlParser:
74
  text = textline.find(f"{self.namespace}TextEquiv").find(f"{self.namespace}Unicode").text
75
  f.write(text + "\n")
76
  f.write("\n")
 
 
 
 
 
74
  text = textline.find(f"{self.namespace}TextEquiv").find(f"{self.namespace}Unicode").text
75
  f.write(text + "\n")
76
  f.write("\n")
77
+
78
+
79
+ if __name__ == "__main__":
80
+ pass
src/htr_pipeline/utils/preprocess_img.py CHANGED
@@ -17,3 +17,7 @@ class Preprocess:
17
  img_gradio = cv2.cvtColor(threshed, cv2.COLOR_BGR2RGB)
18
 
19
  return img_gradio
 
 
 
 
 
17
  img_gradio = cv2.cvtColor(threshed, cv2.COLOR_BGR2RGB)
18
 
19
  return img_gradio
20
+
21
+
22
+ if __name__ == "__main__":
23
+ pass
src/htr_pipeline/utils/process_segmask.py CHANGED
@@ -85,3 +85,7 @@ class SegMaskHelper:
85
  translated_line_polygons = [[[a + box[0], b + box[1]] for [a, b] in poly] for poly in line_polygons]
86
 
87
  return translated_line_polygons
 
 
 
 
 
85
  translated_line_polygons = [[[a + box[0], b + box[1]] for [a, b] in poly] for poly in line_polygons]
86
 
87
  return translated_line_polygons
88
+
89
+
90
+ if __name__ == "__main__":
91
+ pass
src/htr_pipeline/utils/process_xml.py CHANGED
@@ -148,3 +148,7 @@ class XMLHelper:
148
  text_lines.append(line_data)
149
 
150
  return text_lines, htr_scores
 
 
 
 
 
148
  text_lines.append(line_data)
149
 
150
  return text_lines, htr_scores
151
+
152
+
153
+ if __name__ == "__main__":
154
+ pass