Spaces:
Sleeping
Sleeping
| import os | |
| from mamba import description, it, before | |
| from pdftocgen.filter import ( | |
| ToCFilter, | |
| admits_float, | |
| FontFilter, | |
| BoundingBoxFilter | |
| ) | |
| dirpath = os.path.dirname(os.path.abspath(__file__)) | |
| with description("admits_float") as self: | |
| with it("admits if difference is below tol"): | |
| assert admits_float(1, 1.05, 0.1) | |
| assert admits_float(1, 0.95, 0.1) | |
| with it("does not admit if difference is too large"): | |
| assert not admits_float(1, 1.5, 0.1) | |
| assert not admits_float(1, 0.5, 0.1) | |
| with it("admits anything if expect is unset"): | |
| assert admits_float(None, 1, 0.1) | |
| assert admits_float(None, None, 0.1) | |
| with it("does not admit if expect is set but actual is None"): | |
| assert not admits_float(1, None, 0.1) | |
| with description("ToCFilter") as self: | |
| with before.all: | |
| self.title_exact = { | |
| 'level': 1, | |
| 'font': { | |
| 'name': "CMBX12", | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }, | |
| 'bbox': { | |
| 'left': 157.98439025878906, | |
| 'top': 567.3842163085938, | |
| 'right': 245.18057250976562, | |
| 'bottom': 581.7447509765625, | |
| 'tolerance': 0 | |
| } | |
| } | |
| self.text_exact = { | |
| 'level': 2, | |
| 'font': { | |
| 'name': "CMR10", | |
| 'size': 9.962599754333496, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': False | |
| }, | |
| 'bbox': { | |
| 'left': 133.76800537109375, | |
| 'top': 592.492919921875, | |
| 'right': 477.537353515625, | |
| 'bottom': 602.4555053710938, | |
| 'tolerance': 0 | |
| } | |
| } | |
| self.spn_title = { | |
| 'size': 14.346199989318848, | |
| 'flags': 20, | |
| 'font': 'TZOLRB+CMBX12', | |
| 'color': 0, | |
| 'text': 'Section Two', | |
| 'bbox': (157.98439025878906, | |
| 567.3842163085938, | |
| 245.18057250976562, | |
| 581.7447509765625) | |
| } | |
| self.spn_text = { | |
| 'size': 9.962599754333496, | |
| 'flags': 4, | |
| 'font': 'MJDLZY+CMR10', | |
| 'color': 0, | |
| 'text': 'text', | |
| 'bbox': (133.76800537109375, | |
| 592.492919921875, | |
| 477.537353515625, | |
| 602.4555053710938) | |
| } | |
| with it("raises error if no toc level is specified"): | |
| try: | |
| fltr = ToCFilter({}) | |
| except ValueError: | |
| pass | |
| except: | |
| assert False, "must raise error" | |
| with it("raises error if toc level is invalid"): | |
| try: | |
| fltr = ToCFilter({'level': 0}) | |
| fltr = ToCFilter({'level': -1}) | |
| except ValueError: | |
| pass | |
| except: | |
| assert False, "must raise error" | |
| with it("does not raise error if toc level is valid"): | |
| try: | |
| fltr = ToCFilter({'level': 1}) | |
| fltr = ToCFilter({'level': 2}) | |
| except ValueError: | |
| assert False, "must not raise error" | |
| with it("admits exact matches"): | |
| filter_title = ToCFilter(self.title_exact) | |
| filter_text = ToCFilter(self.text_exact) | |
| assert filter_title.admits(self.spn_title) | |
| assert filter_text.admits(self.spn_text) | |
| with it("rejects unmatched spans"): | |
| filter_title = ToCFilter(self.title_exact) | |
| filter_text = ToCFilter(self.text_exact) | |
| assert not filter_title.admits(self.spn_text) | |
| assert not filter_text.admits(self.spn_title) | |
| with it("admits correctly without bbox"): | |
| filter_title = ToCFilter({ | |
| 'level': 1, | |
| 'font': { | |
| 'name': "CMBX12", | |
| } | |
| }) | |
| assert filter_title.admits(self.spn_title) | |
| filter_text = ToCFilter({ | |
| 'level': 2, | |
| 'font': { | |
| 'size': 9.962599754333496, | |
| } | |
| }) | |
| assert filter_text.admits(self.spn_text) | |
| with it("rejects correctly without bbox"): | |
| filter_title = ToCFilter({ | |
| 'level': 1, | |
| 'font': { | |
| 'name': "CMBX12", | |
| } | |
| }) | |
| assert not filter_title.admits(self.spn_text) | |
| filter_text = ToCFilter({ | |
| 'level': 2, | |
| 'font': { | |
| 'size': 9.962599754333496, | |
| } | |
| }) | |
| assert not filter_text.admits(self.spn_title) | |
| with it("admits correctly without font"): | |
| filter_title = ToCFilter({ | |
| 'level': 1, | |
| 'bbox': { | |
| 'left': 157.98439025878906, | |
| } | |
| }) | |
| assert filter_title.admits(self.spn_title) | |
| filter_text = ToCFilter({ | |
| 'level': 2, | |
| 'bbox': { | |
| 'top': 592.492919921875, | |
| } | |
| }) | |
| assert filter_text.admits(self.spn_text) | |
| with it("rejects correctly without font"): | |
| filter_title = ToCFilter({ | |
| 'level': 1, | |
| 'bbox': { | |
| 'left': 157.98439025878906, | |
| } | |
| }) | |
| assert not filter_title.admits(self.spn_text) | |
| filter_text = ToCFilter({ | |
| 'level': 2, | |
| 'bbox': { | |
| 'top': 592.492919921875, | |
| } | |
| }) | |
| assert not filter_text.admits(self.spn_title) | |
| with description("FontFilter") as self: | |
| with before.all: | |
| self.title_exact = { | |
| 'name': "CMBX12", | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| } | |
| self.text_exact = { | |
| 'name': "CMR10", | |
| 'size': 9.962599754333496, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': False | |
| } | |
| self.spn_title = { | |
| 'size': 14.346199989318848, | |
| 'flags': 20, | |
| 'font': 'TZOLRB+CMBX12', | |
| 'color': 0, | |
| 'text': 'Section Two', | |
| 'bbox': (157.98439025878906, | |
| 567.3842163085938, | |
| 245.18057250976562, | |
| 581.7447509765625) | |
| } | |
| self.spn_small_title = { | |
| 'size': 9.962599754333496, | |
| 'flags': 4, | |
| 'font': 'TZOLRB+CMBX12', | |
| 'color': 0, | |
| 'text': 'text', | |
| 'bbox': (133.76800537109375, | |
| 592.492919921875, | |
| 477.537353515625, | |
| 602.4555053710938) | |
| } | |
| self.spn_text = { | |
| 'size': 9.962599754333496, | |
| 'flags': 4, | |
| 'font': 'MJDLZY+CMR10', | |
| 'color': 0, | |
| 'text': 'text', | |
| 'bbox': (133.76800537109375, | |
| 592.492919921875, | |
| 477.537353515625, | |
| 602.4555053710938) | |
| } | |
| with it("has a working constructor"): | |
| fnt = FontFilter(self.title_exact) | |
| assert fnt.name.search("TZOLRB+CMBX12") | |
| assert fnt.name.search("CMBX12") | |
| assert not fnt.name.search("CMBX10") | |
| assert fnt.flags == 0b10100 | |
| assert fnt.ign_mask == 0b11111 | |
| assert fnt.color == 0x000000 | |
| assert fnt.size == 14.346199989318848 | |
| assert fnt.size_tolerance == 0 | |
| with it("can construct if empty dict is given in the constructor"): | |
| fnt = FontFilter({}) | |
| assert fnt.name.search("anything") | |
| assert fnt.flags == 0 | |
| assert fnt.ign_mask == 0 | |
| assert fnt.color is None | |
| assert fnt.size is None | |
| assert fnt.size_tolerance == 1e-5 | |
| with it("admits exact matches"): | |
| fnt_title = FontFilter(self.title_exact) | |
| fnt_text = FontFilter(self.text_exact) | |
| assert fnt_title.admits(self.spn_title) | |
| assert fnt_text.admits(self.spn_text) | |
| with it("rejects unmatched spans"): | |
| fnt_title = FontFilter(self.title_exact) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| fnt_text = FontFilter(self.text_exact) | |
| assert not fnt_text.admits(self.spn_title) | |
| assert not fnt_text.admits(self.spn_small_title) | |
| with it("admits correctly without font name"): | |
| fnt_title = FontFilter({ | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| with it("rejects correctly without font name"): | |
| fnt_title = FontFilter({ | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| with it("admits correctly with only font name"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12" | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| assert fnt_title.admits(self.spn_small_title) | |
| with it("rejects correctly with only font name"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12" | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| with it("admits correctly without size"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12", | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| with it("rejects correctly without size"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12", | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| with it("admits correctly with only size"): | |
| fnt_title = FontFilter({ | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0 | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| with it("rejects correctly with only size"): | |
| fnt_title = FontFilter({ | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0 | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| with it("admits correctly without color"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12", | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| with it("rejects correctly without color"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12", | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| with it("admits correctly with only color"): | |
| fnt_title = FontFilter({ | |
| 'color': 0x000000, | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| assert fnt_title.admits(self.spn_text) | |
| assert fnt_title.admits(self.spn_small_title) | |
| with it("rejects correctly with only color"): | |
| fnt_title = FontFilter({ | |
| 'color': 0x000000, | |
| }) | |
| spn_blue = { | |
| 'size': 14.346199989318848, | |
| 'flags': 20, | |
| 'font': 'TZOLRB+CMBX12', | |
| 'color': 0x0000ff, | |
| 'text': 'Section Two', | |
| 'bbox': (157.98439025878906, | |
| 567.3842163085938, | |
| 245.18057250976562, | |
| 581.7447509765625) | |
| } | |
| assert not fnt_title.admits(spn_blue) | |
| with it("admits correctly with only flags"): | |
| fnt_title = FontFilter({ | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| with it("rejects correctly with only flags"): | |
| fnt_title = FontFilter({ | |
| 'superscript': False, | |
| 'italic': False, | |
| 'serif': True, | |
| 'monospace': False, | |
| 'bold': True | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| with it("admits correctly without flags"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12", | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| with it("rejects correctly without flags"): | |
| fnt_title = FontFilter({ | |
| 'name': "CMBX12", | |
| 'size': 14.346199989318848, | |
| 'size_tolerance': 0, | |
| 'color': 0, | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| with it("admits correctly with partial flags"): | |
| fnt_title = FontFilter({ | |
| 'serif': True, | |
| 'bold': True | |
| }) | |
| fnt_serif = FontFilter({ | |
| 'serif': True | |
| }) | |
| fnt_sans = FontFilter({ | |
| 'serif': False | |
| }) | |
| fnt_mono = FontFilter({ | |
| 'monospace': True | |
| }) | |
| assert fnt_title.admits(self.spn_title) | |
| assert fnt_serif.admits(self.spn_title) | |
| assert fnt_serif.admits(self.spn_text) | |
| assert fnt_sans.admits({'flags': 0b11011}) | |
| assert fnt_mono.admits({'flags': 0b11111}) | |
| with it("rejects correctly with partial flags"): | |
| fnt_title = FontFilter({ | |
| 'serif': True, | |
| 'bold': True | |
| }) | |
| fnt_serif = FontFilter({ | |
| 'serif': True | |
| }) | |
| fnt_sans = FontFilter({ | |
| 'serif': False | |
| }) | |
| fnt_mono = FontFilter({ | |
| 'monospace': True | |
| }) | |
| assert not fnt_title.admits(self.spn_text) | |
| assert not fnt_title.admits(self.spn_small_title) | |
| assert not fnt_sans.admits(self.spn_title) | |
| assert not fnt_sans.admits(self.spn_text) | |
| assert not fnt_mono.admits(self.spn_title) | |
| assert not fnt_mono.admits(self.spn_text) | |
| with description("BoundingBoxFilter") as self: | |
| with before.all: | |
| self.title_exact = { | |
| 'left': 157.98439025878906, | |
| 'top': 567.3842163085938, | |
| 'right': 245.18057250976562, | |
| 'bottom': 581.7447509765625, | |
| 'tolerance': 0 | |
| } | |
| self.text_exact = { | |
| 'left': 133.76800537109375, | |
| 'top': 592.492919921875, | |
| 'right': 477.537353515625, | |
| 'bottom': 602.4555053710938, | |
| 'tolerance': 0 | |
| } | |
| self.spn_title = { | |
| 'size': 14.346199989318848, | |
| 'flags': 20, | |
| 'font': 'TZOLRB+CMBX12', | |
| 'color': 0, | |
| 'text': 'Section Two', | |
| 'bbox': (157.98439025878906, | |
| 567.3842163085938, | |
| 245.18057250976562, | |
| 581.7447509765625) | |
| } | |
| self.spn_title2 = { | |
| 'size': 14.346199989318848, | |
| 'flags': 20, | |
| 'font': 'TZOLRB+CMBX12', | |
| 'color': 0, | |
| 'text': 'Section One', | |
| 'bbox': (157.98439025878906, | |
| 335.569580078125, | |
| 477.66058349609375, | |
| 349.93011474609375) | |
| } | |
| self.spn_text = { | |
| 'size': 9.962599754333496, | |
| 'flags': 4, | |
| 'font': 'MJDLZY+CMR10', | |
| 'color': 0, | |
| 'text': 'text', | |
| 'bbox': (133.76800537109375, | |
| 592.492919921875, | |
| 477.537353515625, | |
| 602.4555053710938) | |
| } | |
| with it("has a working constructor"): | |
| bbox = BoundingBoxFilter(self.title_exact) | |
| assert bbox.left is not None | |
| assert bbox.right is not None | |
| assert bbox.top is not None | |
| assert bbox.bottom is not None | |
| assert bbox.tolerance == 0 | |
| with it("can construct if empty dict is given in the constructor"): | |
| bbox = BoundingBoxFilter({}) | |
| assert bbox.left is None | |
| assert bbox.right is None | |
| assert bbox.top is None | |
| assert bbox.bottom is None | |
| assert bbox.tolerance == 1e-5 | |
| with it("admits exact matches"): | |
| bbox_title = BoundingBoxFilter(self.title_exact) | |
| bbox_text = BoundingBoxFilter(self.text_exact) | |
| assert bbox_title.admits(self.spn_title) | |
| assert bbox_text.admits(self.spn_text) | |
| with it("rejects unmatched spans"): | |
| bbox_title = BoundingBoxFilter(self.title_exact) | |
| assert not bbox_title.admits(self.spn_text) | |
| assert not bbox_title.admits(self.spn_title2) | |
| bbox_text = BoundingBoxFilter(self.text_exact) | |
| assert not bbox_text.admits(self.spn_title) | |
| assert not bbox_text.admits(self.spn_title2) | |
| with it("admits correctly with partial bbox"): | |
| bbox_title = BoundingBoxFilter({ | |
| 'left': 157.98439025878906 | |
| }) | |
| assert bbox_title.admits(self.spn_title) | |
| assert bbox_title.admits(self.spn_title2) | |
| bbox_top = BoundingBoxFilter({ | |
| 'top': 567.3842163085938 | |
| }) | |
| assert bbox_top.admits(self.spn_title) | |
| bbox_right = BoundingBoxFilter({ | |
| 'right': 245.18057250976562 | |
| }) | |
| assert bbox_right.admits(self.spn_title) | |
| bbox_bottom = BoundingBoxFilter({ | |
| 'bottom': 581.7447509765625 | |
| }) | |
| assert bbox_bottom.admits(self.spn_title) | |
| with it("rejects correctly with partial bbox"): | |
| bbox_title = BoundingBoxFilter({ | |
| 'left': 157.98439025878906 | |
| }) | |
| assert not bbox_title.admits(self.spn_text) | |
| bbox_top = BoundingBoxFilter({ | |
| 'top': 567.3842163085938 | |
| }) | |
| assert not bbox_top.admits(self.spn_title2) | |
| bbox_right = BoundingBoxFilter({ | |
| 'right': 245.18057250976562 | |
| }) | |
| assert not bbox_right.admits(self.spn_title2) | |
| bbox_bottom = BoundingBoxFilter({ | |
| 'bottom': 581.7447509765625 | |
| }) | |
| assert not bbox_bottom.admits(self.spn_title2) | |