Spaces:
Runtime error
Runtime error
| import re | |
| from pathlib import Path | |
| from typing import List | |
| import pandas as pd | |
| from docling_core.types.doc import ( | |
| BoundingBox, | |
| CoordOrigin, | |
| DocItemLabel, | |
| DoclingDocument, | |
| DocumentOrigin, | |
| GroupLabel, | |
| ProvenanceItem, | |
| Size, | |
| TableCell, | |
| TableData, | |
| ) | |
| from docling_core.types.doc.document import ContentLayer | |
def resolve_item(paths, obj):
    """Find item in document from a reference path.

    Args:
        paths: Sequence of path segments, typically a reference such as
            ``"#/tables/0"`` split on ``"/"``.  Segments equal to ``"#"``
            are skipped wherever they occur.
        obj: Nested structure of dicts and lists/tuples to walk.

    Returns:
        The item the path points at (``obj`` itself for an empty path), or
        ``None`` as soon as a segment cannot be resolved: a missing dict key,
        an out-of-range or negative list index, a non-numeric segment applied
        to a list, or an attempt to descend into a value that is neither a
        dict nor a list/tuple.
    """
    current = obj
    for segment in paths:
        if segment == "#":
            continue

        # Numeric-looking segments are candidate list indices; everything
        # else is used as a dict key.
        try:
            key = int(segment)
        except (TypeError, ValueError):
            key = segment

        if isinstance(current, dict):
            if key in current:
                current = current[key]
            elif segment in current:
                # Digit-only segment naming a string key such as "0".
                current = current[segment]
            else:
                return None
        elif isinstance(current, (list, tuple)):
            # Reject negative indices explicitly: they would otherwise wrap
            # around and silently resolve to the wrong element.
            if isinstance(key, int) and 0 <= key < len(current):
                current = current[key]
            else:
                return None
        else:
            # Scalars cannot be descended into.
            return None
    return current
| def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]: | |
| unique_objects = [] | |
| seen_spans = set() | |
| for sublist in grid: | |
| for obj in sublist: | |
| # Convert the spans list to a tuple of tuples for hashing | |
| spans_tuple = tuple(tuple(span) for span in obj["spans"]) | |
| if spans_tuple not in seen_spans: | |
| seen_spans.add(spans_tuple) | |
| unique_objects.append(obj) | |
| return unique_objects | |
# Page elements that reference a figure or table caption.  They are emitted
# together with their parent figure/table, so the main loop skips them.
_CAPTION_IREF_RE = re.compile(r"#/(?:figures|tables)/\d+/captions/.+")


def _add_caption_items(doc, doc_glm, captions):
    """Add one CAPTION text item to ``doc`` per resolvable caption provenance.

    For every ``prov`` entry of every caption, the referenced element is
    looked up in ``doc_glm``; its ``span`` selects the slice of the caption
    text that becomes the item text.  Unresolvable references are skipped.

    Returns:
        List of references to the added caption items, in insertion order.
    """
    caption_refs = []
    for caption in captions:
        for nprov in caption["prov"]:
            nelem = resolve_item(nprov["$ref"].split("/"), doc_glm)
            if nelem is None:
                continue

            span_i = nelem["span"][0]
            span_j = nelem["span"][1]
            prov = ProvenanceItem(
                page_no=nelem["page"],
                charspan=tuple(nelem["span"]),
                bbox=BoundingBox.from_tuple(
                    nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )
            caption_obj = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=caption["text"][span_i:span_j],
                prov=prov,
            )
            caption_refs.append(caption_obj.get_ref())
    return caption_refs


def _to_table_cell(cell_glm):
    """Convert one GLM table-cell dict into a ``TableCell``."""
    if cell_glm["bbox"] is not None:
        bbox = BoundingBox.from_tuple(
            cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
        )
    else:
        bbox = None

    row_start, row_end = cell_glm["row-span"][0], cell_glm["row-span"][1]
    col_start, col_end = cell_glm["col-span"][0], cell_glm["col-span"][1]
    cell_type = cell_glm["type"]

    return TableCell(
        row_span=row_end - row_start,
        col_span=col_end - col_start,
        start_row_offset_idx=row_start,
        end_row_offset_idx=row_end,
        start_col_offset_idx=col_start,
        end_col_offset_idx=col_end,
        text=cell_glm["text"],
        bbox=bbox,
        column_header=cell_type == "col_header",
        row_header=cell_type == "row_header",
        row_section=cell_type == "row_section",
    )


def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    """Convert a GLM document dict into a ``DoclingDocument``.

    Args:
        doc_glm: Dict providing ``"file-info"`` (``"filename"``,
            ``"document-hash"``), ``"page-dimensions"``, ``"page-elements"``
            and, optionally, ``"properties"`` (``"data"`` and ``"headers"``).
        update_name_label: When true and properties are available, a
            ``"paragraph"`` element's label is replaced by the label of its
            ``"semantic"`` property, provided exactly one such property
            matches the element reference and its confidence exceeds 0.85.

    Returns:
        The populated ``DoclingDocument``.  Page elements are processed in
        order: figures become pictures, tables become tables, form and
        key-value regions become groups, and elements whose referenced
        object has ``"text"`` become text-like items.  Consecutive list
        items share one list group; any other emitted element, or an
        unresolvable reference, ends the current list.
    """
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page_dim in doc_glm["page-dimensions"]:
        page_no = int(page_dim["page"])
        size = Size(width=page_dim["width"], height=page_dim["height"])
        doc.add_page(page_no=page_no, size=size)

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Group receiving consecutive list items; None when no list is open.
    current_list = None

    for pelem in doc_glm["page-elements"]:
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        if "iref" not in pelem:
            continue

        iref = pelem["iref"]

        # Captions are added alongside their figure/table below.
        if _CAPTION_IREF_RE.match(iref):
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            caption_refs = _add_caption_items(doc, doc_glm, obj["captions"])

            # The picture's charspan covers the concatenated caption text.
            text = "".join(caption["text"] for caption in obj["captions"])
            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            pic = doc.add_picture(prov=prov)
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            item_label = DocItemLabel(pelem["name"])
            caption_refs = _add_caption_items(doc, doc_glm, obj["captions"])

            table_cells = [
                _to_table_cell(cell_glm)
                for cell_glm in _flatten_table_grid(obj["data"])
            ]
            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=table_cells,
            )

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, 0),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
            tbl.captions.extend(caption_refs)

        elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
            # A form / key-value region interrupts a running list, exactly
            # like every other non-list element handled in this loop.
            current_list = None

            label = DocItemLabel(ptype)
            group_label = GroupLabel.UNSPECIFIED
            if label == DocItemLabel.FORM:
                group_label = GroupLabel.FORM_AREA
            elif label == DocItemLabel.KEY_VALUE_REGION:
                group_label = GroupLabel.KEY_VALUE_AREA

            container_el = doc.add_group(label=group_label)
            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            text = obj["text"][span_i:span_j]

            type_label = pelem["type"]
            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and type_label == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None
                doc.add_heading(text=text, prov=prov)
            elif label == DocItemLabel.CODE:
                current_list = None
                doc.add_code(text=text, prov=prov)
            elif label == DocItemLabel.FORMULA:
                current_list = None
                # The extracted text is kept as ``orig``; ``text`` stays empty.
                doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
            elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
                current_list = None
                doc.add_text(
                    label=label,
                    text=text,
                    prov=prov,
                    content_layer=ContentLayer.FURNITURE,
                )
            else:
                current_list = None
                doc.add_text(label=label, text=text, prov=prov)

    return doc
def _add_child_elements(container_el, doc, obj, pelem):
    """Attach the children listed in ``obj["payload"]`` under ``container_el``.

    Does nothing when ``obj`` has no payload (or the payload is ``None``).
    For every entry of ``payload["children"]`` a document item is added to
    ``doc`` with ``container_el`` as parent: a list item or a heading when
    the child's label says so, otherwise a text item carrying that label.
    The item text is the space-joined, stripped text of the child's
    non-blank cells, with ``"\\x02"`` characters replaced by ``"-"``.
    """
    payload = obj.get("payload")
    if payload is None:
        return

    for child in payload.get("children", []):
        kind = DocItemLabel(child["label"])

        # The child's box is validated, then converted to a bottom-left
        # origin using the height of the page the parent element is on.
        raw_box = BoundingBox.model_validate(child["bbox"])
        page_height = doc.pages[pelem["page"]].size.height
        box = raw_box.to_bottom_left_origin(page_height)

        fragments = []
        for cell in child["cells"]:
            if len(cell["text"].strip()) > 0:
                fragments.append(cell["text"].replace("\x02", "-").strip())
        joined = " ".join(fragments)

        provenance = ProvenanceItem(
            page_no=pelem["page"], charspan=(0, len(joined)), bbox=box
        )

        if kind == DocItemLabel.LIST_ITEM:
            # TODO: Infer if this is a numbered or a bullet list item
            doc.add_list_item(parent=container_el, text=joined, prov=provenance)
            continue
        if kind == DocItemLabel.SECTION_HEADER:
            doc.add_heading(parent=container_el, text=joined, prov=provenance)
            continue
        doc.add_text(parent=container_el, label=kind, text=joined, prov=provenance)