ArneBinder committed
Commit ced4316 · verified · 1 Parent(s): aed01f0

update from https://github.com/ArneBinder/pie-document-level/pull/397

Files changed (50)
  1. argumentation_model/_joint.yaml +4 -0
  2. argumentation_model/_pipelined.yaml +17 -0
  3. argumentation_model/joint.yaml +10 -0
  4. argumentation_model/joint_hps.yaml +7 -0
  5. argumentation_model/pipelined.yaml +8 -0
  6. argumentation_model/pipelined_deprecated.yaml +9 -0
  7. argumentation_model/pipelined_hps.yaml +8 -0
  8. argumentation_model/pipelined_new.yaml +14 -0
  9. demo.yaml +85 -0
  10. pdf_fulltext_extractor/grobid_local.yaml +18 -0
  11. pdf_fulltext_extractor/none.yaml +0 -0
  12. requirements.txt +8 -2
  13. retriever/related_span_retriever_with_relations_from_other_docs.yaml +49 -0
  14. src/analysis/__init__.py +0 -0
  15. src/analysis/combine_job_returns.py +169 -0
  16. src/analysis/common.py +47 -0
  17. src/analysis/compare_job_returns.py +407 -0
  18. src/data/acl_anthology_crawler.py +117 -0
  19. src/data/calc_iaa_for_brat.py +272 -0
  20. src/data/construct_sciarg_abstracts_remaining_gold_retrieval.py +238 -0
  21. src/data/prepare_sciarg_crosssection_annotations.py +398 -0
  22. src/data/split_sciarg_abstracts.py +132 -0
  23. src/demo/annotation_utils.py +88 -41
  24. src/demo/backend_utils.py +106 -13
  25. src/demo/frontend_utils.py +12 -0
  26. src/demo/rendering_utils.py +23 -3
  27. src/demo/rendering_utils_displacy.py +12 -1
  28. src/demo/retrieve_and_dump_all_relevant.py +61 -2
  29. src/demo/retriever_utils.py +8 -6
  30. src/document/processing.py +212 -77
  31. src/hydra_callbacks/save_job_return_value.py +178 -40
  32. src/langchain_modules/pie_document_store.py +1 -1
  33. src/langchain_modules/span_retriever.py +13 -16
  34. src/pipeline/ner_re_pipeline.py +45 -15
  35. src/predict.py +6 -2
  36. src/start_demo.py +161 -36
  37. src/train.py +10 -0
  38. src/utils/__init__.py +6 -1
  39. src/utils/config_utils.py +15 -1
  40. src/utils/pdf_utils/README.MD +35 -0
  41. src/utils/pdf_utils/__init__.py +0 -0
  42. src/utils/pdf_utils/acl_anthology_utils.py +77 -0
  43. src/utils/pdf_utils/client.py +193 -0
  44. src/utils/pdf_utils/grobid_client.py +203 -0
  45. src/utils/pdf_utils/grobid_util.py +413 -0
  46. src/utils/pdf_utils/process_pdf.py +276 -0
  47. src/utils/pdf_utils/raw_paper.py +90 -0
  48. src/utils/pdf_utils/s2orc_paper.py +478 -0
  49. src/utils/pdf_utils/s2orc_utils.py +61 -0
  50. src/utils/pdf_utils/utils.py +904 -0
argumentation_model/_joint.yaml ADDED
@@ -0,0 +1,4 @@
+ _target_: pytorch_ie.auto.AutoPipeline.from_pretrained
+ pretrained_model_name_or_path: ???
+ # this batch_size works well (fastest) on a single RTX2080Ti (11GB) (see https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2613232344)
+ batch_size: 1
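For orientation, a minimal sketch (not part of the commit) of what this base config resolves to once the mandatory `???` value is filled in; the model name, revision, and batch size are the values set in _joint.yaml and joint.yaml, normally injected by Hydra via the config group.

```python
# Minimal sketch, not part of the commit: resolving _joint.yaml / joint.yaml by hand.
from pytorch_ie.auto import AutoPipeline

# values taken from joint.yaml; in the demo they are injected by Hydra
pipeline = AutoPipeline.from_pretrained(
    "ArneBinder/sam-pointer-bart-base-v0.4",
    revision="0445c69bafa31f8153aaeafc1767fad84919926a",
    batch_size=1,  # fastest setting on a single RTX2080Ti (11GB), see the linked issue
)
```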
argumentation_model/_pipelined.yaml ADDED
@@ -0,0 +1,17 @@
+ _target_: src.pipeline.NerRePipeline
+ ner_model_path: ???
+ re_model_path: ???
+ entity_layer: labeled_spans
+ relation_layer: binary_relations
+ # this works well on a single RTX2080Ti (11GB)
+ ner_pipeline:
+ batch_size: 256
+ re_pipeline:
+ batch_size: 64
+ # convert the RE model to half precision for mixed precision inference (speedup approx. 4x)
+ half_precision_model: true
+ taskmodule_kwargs:
+ # don't show statistics after encoding
+ collect_statistics: false
+ # don't show pipeline steps
+ verbose: false
argumentation_model/joint.yaml ADDED
@@ -0,0 +1,10 @@
+ defaults:
+ - _joint
+
+ # best model based on the validation set (see https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2613232344 for details)
+ # i.e. models from https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2578422544, but with last checkpoint (instead of best validation checkpoint)
+ # model_name_or_path: models/dataset-sciarg/task-ner_re/v0.4/2025-01-09_01-50-53
+ # ckpt_path: logs/training/multiruns/dataset-sciarg/task-ner_re/v0.4/2025-01-09_01-50-52/2/checkpoints/last.ckpt
+ # w&b run (for the loaded checkpoint): [icy-glitter-5](https://wandb.ai/arne/dataset-sciarg-task-ner_re-v0.4-training/runs/it5toj6w)
+ pretrained_model_name_or_path: "ArneBinder/sam-pointer-bart-base-v0.4"
+ revision: "0445c69bafa31f8153aaeafc1767fad84919926a"
argumentation_model/joint_hps.yaml ADDED
@@ -0,0 +1,7 @@
+ defaults:
+ - _joint
+
+ # from: hparams_search for all datasets
+ # see https://github.com/ArneBinder/pie-document-level/pull/381#issuecomment-2682711151
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
+ pretrained_model_name_or_path: models/dataset-sciarg/task-ner_re/2025-02-23_05-16-45
argumentation_model/pipelined.yaml ADDED
@@ -0,0 +1,8 @@
+ defaults:
+ - _pipelined
+
+ # from: train pipeline models with bigger train set,
+ # see https://github.com/ArneBinder/pie-document-level/issues/355#issuecomment-2612958658
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
+ ner_model_path: models/dataset-sciarg/task-adus/v0.4/2025-01-20_05-50-00
+ re_model_path: models/dataset-sciarg/task-relations/v0.4/2025-01-22_20-36-23
argumentation_model/pipelined_deprecated.yaml ADDED
@@ -0,0 +1,9 @@
+ defaults:
+ - _pipelined
+
+ # from: train pipeline models with bigger train set, but with strange choice of models,
+ # see edit history of https://github.com/ArneBinder/pie-document-level/issues/355#issuecomment-2612958658
+ # NOTE: these were originally in the pipelined.yaml
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
+ ner_model_path: models/dataset-sciarg/task-adus/v0.4/2025-01-20_09-09-11
+ re_model_path: models/dataset-sciarg/task-relations/v0.4/2025-01-22_12-44-51
argumentation_model/pipelined_hps.yaml ADDED
@@ -0,0 +1,8 @@
+ defaults:
+ - _pipelined
+
+ # from: hparams_search for all datasets,
+ # see https://github.com/ArneBinder/pie-document-level/pull/381#issuecomment-2684865102
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
+ ner_model_path: models/dataset-sciarg/task-adur/2025-02-26_07-14-59
+ re_model_path: models/dataset-sciarg/task-are/2025-02-20_18-09-25
argumentation_model/pipelined_new.yaml ADDED
@@ -0,0 +1,14 @@
+ defaults:
+ - _pipelined
+
+ # from: Update scientific ARE experiment configs,
+ # see https://github.com/ArneBinder/pie-document-level/pull/379#issuecomment-2651669398
+ # i.e. the models are now on Hugging Face
+ # ner_model_path: models/dataset-sciarg/task-adur/2025-02-09_23-08-37
+ # re_model_path: models/dataset-sciarg/task-are/2025-02-10_19-24-52
+ ner_model_path: ArneBinder/sam-adur-sciarg
+ ner_pipeline:
+ revision: bcbef4e585a5f637009ff702661cf824abede6b0
+ re_model_path: ArneBinder/sam-are-sciarg
+ re_pipeline:
+ revision: 93024388330c58daf20963c2020e08f54553e74c
demo.yaml ADDED
@@ -0,0 +1,85 @@
+ defaults:
+ - _self_
+ # default retriever, see subfolder retriever for more details
+ - retriever: related_span_retriever_with_relations_from_other_docs
+ # default argumentation model, see subfolder argumentation_model for more details
+ - argumentation_model: pipelined_new
+ # since this requires a running GROBID server, we disable it by default
+ - pdf_fulltext_extractor: none
+
+ # Whether to handle segmented entities in the document. If True, labeled_spans are converted
+ # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
+ # This requires the networkx package to be installed.
+ handle_parts_of_same: true
+ # Split the document text into sections that are processed separately.
+ default_split_regex: "\n\n\n+"
+
+ # retriever details (query parameters)
+ default_min_similarity: 0.95
+ default_top_k: 10
+
+ # data import details
+ default_arxiv_id: "1706.03762"
+ default_load_pie_dataset_kwargs:
+ path: "pie/sciarg"
+ name: "resolve_parts_of_same"
+ split: "train"
+
+ # set to the data directory of https://github.com/acl-org/acl-anthology
+ # to enable ACL venue PDF import (also requires a valid pdf_fulltext_extractor)
+ # acl_anthology_data_dir=../acl-anthology/data
+ # temporary directory to store downloaded PDFs
+ acl_anthology_pdf_dir: "data/acl-anthology/pdf"
+
+ # for better readability in the UI
+ render_mode_captions:
+ displacy: "displaCy + highlighted arguments"
+ pretty_table: "Pretty Table"
+ layer_caption_mapping:
+ labeled_multi_spans: "adus"
+ binary_relations: "relations"
+ labeled_partitions: "partitions"
+ relation_name_mapping:
+ supports_reversed: "supported by"
+ contradicts_reversed: "contradicts"
+
+ default_render_mode: "displacy"
+ default_render_kwargs:
+ entity_options:
+ # we need to have the keys as uppercase because the spaCy rendering function converts the labels to uppercase
+ colors:
+ OWN_CLAIM: "#009933"
+ BACKGROUND_CLAIM: "#99ccff"
+ DATA: "#993399"
+ colors_hover:
+ selected: "#ffa"
+ # tail options for relationships
+ tail:
+ # green
+ supports: "#9f9"
+ # red
+ contradicts: "#f99"
+ # do not highlight
+ parts_of_same: null
+ head: null # "#faf"
+ other: null
+
+ example_text: >
+ Scholarly Argumentation Mining (SAM) has recently gained attention due to its
+ potential to help scholars with the rapid growth of published scientific literature.
+ It comprises two subtasks: argumentative discourse unit recognition (ADUR) and
+ argumentative relation extraction (ARE), both of which are challenging since they
+ require e.g. the integration of domain knowledge, the detection of implicit statements,
+ and the disambiguation of argument structure.
+
+ While previous work focused on dataset construction and baseline methods for
+ specific document sections, such as abstract or results, full-text scholarly argumentation
+ mining has seen little progress. In this work, we introduce a sequential pipeline model
+ combining ADUR and ARE for full-text SAM, and provide a first analysis of the
+ performance of pretrained language models (PLMs) on both subtasks.
+
+ We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best
+ reported result by a large margin (+7% F1). We also present the first results for ARE, and
+ thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals
+ that non-contiguous ADUs as well as the interpretation of discourse connectors pose major
+ challenges and that data annotation needs to be more consistent.
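A hedged sketch (not part of the commit) of how a Hydra config like this is typically composed and its groups instantiated; the relative config path and the override value are assumptions, not something this commit pins down.

```python
# Hedged sketch, not part of the commit. Assumes demo.yaml lives in a Hydra
# config directory named "configs" relative to this script; adjust as needed.
from hydra import compose, initialize
from hydra.utils import instantiate

with initialize(version_base=None, config_path="configs"):
    cfg = compose(
        config_name="demo",
        # e.g. switch to the joint argumentation model defined above
        overrides=["argumentation_model=joint"],
    )

retriever = instantiate(cfg.retriever)
argumentation_model = instantiate(cfg.argumentation_model)
print(cfg.default_split_regex, cfg.default_min_similarity, cfg.default_top_k)
```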
pdf_fulltext_extractor/grobid_local.yaml ADDED
@@ -0,0 +1,18 @@
+ # This requires a running GROBID server. To start the server via Docker, run:
+ # docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
+
+ _target_: src.utils.pdf_utils.process_pdf.GrobidFulltextExtractor
+ section_seperator: "\n\n\n"
+ paragraph_seperator: "\n\n"
+ grobid_config:
+ grobid_server: localhost
+ grobid_port: 8070
+ batch_size: 1000
+ sleep_time: 5
+ generateIDs: false
+ consolidate_header: false
+ consolidate_citations: false
+ include_raw_citations: true
+ include_raw_affiliations: false
+ max_workers: 2
+ verbose: false
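The extractor configured here is used as a plain callable in src/data/acl_anthology_crawler.py (added below): it takes a PDF path and, on success, returns the plain text together with a dict holding "abstract" and "sections". A hedged usage sketch, not part of the commit; the PDF path is a placeholder and the constructor arguments simply mirror this config.

```python
# Hedged sketch, not part of the commit; "paper.pdf" is a placeholder path.
# Requires a running GROBID server, e.g. the docker command in the comment above.
from src.utils.pdf_utils.process_pdf import GrobidFulltextExtractor

extractor = GrobidFulltextExtractor(
    section_seperator="\n\n\n",    # keyword names as spelled in this config
    paragraph_seperator="\n\n",
)
output = extractor("paper.pdf")
if output:
    plain_text, extraction_data = output
    print(extraction_data.get("abstract"))
    print(len(extraction_data.get("sections") or []))
```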
pdf_fulltext_extractor/none.yaml ADDED
File without changes
requirements.txt CHANGED
@@ -1,7 +1,11 @@
+ # -------- dl backend -------- #
+ torch==2.0.0
+ pytorch-lightning==2.1.2
+
  # --------- pytorch-ie --------- #
- pytorch-ie>=0.29.6,<0.32.0
+ pytorch-ie>=0.31.4,<0.32.0
  pie-datasets>=0.10.5,<0.11.0
- pie-modules>=0.14.0,<0.15.0
+ pie-modules>=0.14.2,<0.15.0

  # --------- models -------- #
  adapters>=0.1.2,<0.2.0
@@ -17,6 +21,8 @@ qdrant-client>=1.12.0,<2.0.0
  # --------- demo -------- #
  gradio~=5.5.0
  arxiv~=2.1.3
+ # data preparation
+ acl-anthology-py>=0.4.3

  # --------- hydra --------- #
  hydra-core>=1.3.0
retriever/related_span_retriever_with_relations_from_other_docs.yaml ADDED
@@ -0,0 +1,49 @@
+ _target_: src.langchain_modules.DocumentAwareSpanRetrieverWithRelations
+ symmetric_relations:
+ - contradicts
+ reversed_relations_suffix: _reversed
+ relation_labels:
+ - supports_reversed
+ - contradicts
+ retrieve_from_same_document: false
+ retrieve_from_different_documents: true
+ pie_document_type:
+ _target_: pie_modules.utils.resolve_type
+ type_or_str: pytorch_ie.documents.TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions
+ docstore:
+ _target_: src.langchain_modules.DatasetsPieDocumentStore
+ search_kwargs:
+ k: 10
+ search_type: similarity_score_threshold
+ vectorstore:
+ _target_: src.langchain_modules.QdrantSpanVectorStore
+ embedding:
+ _target_: src.langchain_modules.HuggingFaceSpanEmbeddings
+ model:
+ _target_: src.models.utils.load_model_with_adapter
+ model_kwargs:
+ pretrained_model_name_or_path: allenai/specter2_base
+ adapter_kwargs:
+ adapter_name_or_path: allenai/specter2
+ load_as: proximity
+ source: hf
+ pipeline_kwargs:
+ tokenizer: allenai/specter2_base
+ stride: 64
+ batch_size: 32
+ model_max_length: 512
+ client:
+ _target_: qdrant_client.QdrantClient
+ location: ":memory:"
+ collection_name: adus
+ vector_params:
+ distance:
+ _target_: qdrant_client.http.models.Distance
+ value: Cosine
+ label_mapping:
+ background_claim:
+ - background_claim
+ - own_claim
+ own_claim:
+ - background_claim
+ - own_claim
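Like the other config groups, this file is meant to be instantiated by Hydra. A hedged sketch (not part of the commit) of doing that by hand with the standard OmegaConf/Hydra utilities; the file path is the one added in this commit.

```python
# Hedged sketch, not part of the commit: instantiate the retriever config directly.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("retriever/related_span_retriever_with_relations_from_other_docs.yaml")
retriever = instantiate(cfg)
# Note: the Qdrant client is configured with location=":memory:", so the span
# index lives in-process and has to be re-populated on every start.
```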
src/analysis/__init__.py ADDED
File without changes
src/analysis/combine_job_returns.py ADDED
@@ -0,0 +1,169 @@
1
+ import pyrootutils
2
+
3
+ root = pyrootutils.setup_root(
4
+ search_from=__file__,
5
+ indicator=[".project-root"],
6
+ pythonpath=True,
7
+ dotenv=False,
8
+ )
9
+
10
+ import argparse
11
+ import os
12
+
13
+ import pandas as pd
14
+
15
+ from src.analysis.common import read_nested_jsons
16
+
17
+
18
+ def separate_path_and_id(path_and_maybe_id: str, separator: str = ":") -> tuple[str | None, str]:
19
+ parts = path_and_maybe_id.split(separator, 1)
20
+ if len(parts) == 1:
21
+ return None, parts[0]
22
+ return parts[0], parts[1]
23
+
24
+
25
+ def get_file_paths(paths_file: str, file_name: str, use_aggregated: bool) -> dict[str, str]:
26
+ with open(paths_file, "r") as f:
27
+ paths_maybe_with_ids = f.readlines()
28
+ ids, paths = zip(*[separate_path_and_id(path.strip()) for path in paths_maybe_with_ids])
29
+
30
+ if use_aggregated:
31
+ file_base_name, ext = os.path.splitext(file_name)
32
+ file_name = f"{file_base_name}.aggregated{ext}"
33
+ file_paths = [os.path.join(path, file_name) for path in paths]
34
+ return {
35
+ id if id is not None else f"idx={idx}": path
36
+ for idx, (id, path) in enumerate(zip(ids, file_paths))
37
+ }
38
+
39
+
40
+ def main(
41
+ paths_file: str,
42
+ file_name: str,
43
+ use_aggregated: bool,
44
+ columns: list[str] | None,
45
+ round_precision: int | None,
46
+ format: str,
47
+ transpose: bool = False,
48
+ unpack_multirun_results: bool = False,
49
+ in_percent: bool = False,
50
+ ):
51
+ file_paths = get_file_paths(
52
+ paths_file=paths_file, file_name=file_name, use_aggregated=use_aggregated
53
+ )
54
+ data = read_nested_jsons(json_paths=file_paths)
55
+
56
+ if columns is not None:
57
+ columns_multi_index = [tuple(col.split("/")) for col in columns]
58
+ try:
59
+ data_series = [data[col] for col in columns_multi_index]
60
+ except KeyError as e:
61
+ print(
62
+ f"Columns {columns_multi_index} not found in the data. Available columns are {list(data.columns)}."
63
+ )
64
+ raise e
65
+ data = pd.concat(data_series, axis=1)
66
+
67
+ # drop rows that are all NaN
68
+ data = data.dropna(how="all")
69
+
70
+ # if more than one data point, drop the index levels that are everywhere the same
71
+ if len(data) > 1:
72
+ unique_levels = [
73
+ idx
74
+ for idx, level in enumerate(data.index.levels)
75
+ if len(data.index.get_level_values(idx).unique()) == 1
76
+ ]
77
+ for level in sorted(unique_levels, reverse=True):
78
+ data.index = data.index.droplevel(level)
79
+
80
+ # if more than one column, drop the columns that are everywhere the same
81
+ if len(data.columns) > 1:
82
+ unique_column_levels = [
83
+ idx
84
+ for idx, level in enumerate(data.columns.levels)
85
+ if len(data.columns.get_level_values(idx).unique()) == 1
86
+ ]
87
+ for level in sorted(unique_column_levels, reverse=True):
88
+ data.columns = data.columns.droplevel(level)
89
+
90
+ if unpack_multirun_results:
91
+ index_names = list(data.index.names)
92
+ data_series_lists = data.unstack()
93
+ data = pd.DataFrame.from_records(
94
+ data_series_lists.values, index=data_series_lists.index
95
+ ).stack()
96
+ for _, index_name in enumerate(index_names):
97
+ data = data.unstack(index_name)
98
+ data = data.T
99
+
100
+ if transpose:
101
+ data = data.T
102
+
103
+ # needs to happen before rounding, otherwise the rounding will be off
104
+ if in_percent:
105
+ data = data * 100
106
+
107
+ if round_precision is not None:
108
+ data = data.round(round_precision)
109
+
110
+ if format == "markdown":
111
+ print(data.to_markdown())
112
+ elif format == "markdown_mean_and_std":
113
+ if transpose:
114
+ data = data.T
115
+ if "mean" not in data.columns or "std" not in data.columns:
116
+ raise ValueError("Columns 'mean' and 'std' are required for this format.")
117
+ # create a single column with mean and std in the format: mean ± std
118
+ data = pd.DataFrame(
119
+ data["mean"].astype(str) + " ± " + data["std"].astype(str), columns=["mean ± std"]
120
+ )
121
+ if transpose:
122
+ data = data.T
123
+ print(data.to_markdown())
124
+ elif format == "json":
125
+ print(data.to_json())
126
+ else:
127
+ raise ValueError(f"Invalid format: {format}. Use 'markdown' or 'json'.")
128
+
129
+
130
+ if __name__ == "__main__":
131
+ parser = argparse.ArgumentParser(description="Combine job returns and show as Markdown table")
132
+ parser.add_argument(
133
+ "--paths-file", type=str, help="Path to the file containing the paths to the job returns"
134
+ )
135
+ parser.add_argument(
136
+ "--use-aggregated", action="store_true", help="Whether to use the aggregated job returns"
137
+ )
138
+ parser.add_argument(
139
+ "--file-name",
140
+ type=str,
141
+ default="job_return_value.json",
142
+ help="Name of the file to write the aggregated job returns to",
143
+ )
144
+ parser.add_argument(
145
+ "--columns", type=str, nargs="+", help="Columns to select from the combined job returns"
146
+ )
147
+ parser.add_argument(
148
+ "--unpack-multirun-results", action="store_true", help="Unpack multirun results"
149
+ )
150
+ parser.add_argument("--transpose", action="store_true", help="Transpose the table")
151
+ parser.add_argument(
152
+ "--round-precision",
153
+ type=int,
154
+ help="Round the values in the combined job returns to the specified precision",
155
+ )
156
+ parser.add_argument(
157
+ "--in-percent", action="store_true", help="Show the values in percent (multiply by 100)"
158
+ )
159
+ parser.add_argument(
160
+ "--format",
161
+ type=str,
162
+ default="markdown",
163
+ choices=["markdown", "markdown_mean_and_std", "json"],
164
+ help="Format to output the combined job returns",
165
+ )
166
+
167
+ args = parser.parse_args()
168
+ kwargs = vars(args)
169
+ main(**kwargs)
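Per separate_path_and_id and get_file_paths above, each line of the --paths-file names one job output directory, optionally prefixed with an identifier and a colon. A hedged sketch (not part of the commit) of such a file; the directories are taken from the CONFIGS in compare_job_returns.py below and may not exist in your checkout.

```python
# Hedged sketch, not part of the commit; the log directories are examples only.
from pathlib import Path

# one "<identifier>:<job output dir>" entry per line; the identifier is optional
Path("paths.txt").write_text(
    "epochs=75:logs/document_evaluation/multiruns/default/2025-01-12_13-28-54\n"
    "epochs=150:logs/document_evaluation/multiruns/default/2025-01-15_16-02-04\n"
)
# then, for example:
#   python src/analysis/combine_job_returns.py --paths-file paths.txt --use-aggregated --format markdown
```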
src/analysis/common.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ from typing import Dict, List, Optional
+
+ import pandas as pd
+
+
+ def parse_identifier(
+     identifier_str, defaults: Dict[str, str], parts_sep: str = ",", key_val_sep: str = "="
+ ) -> Dict[str, str]:
+     parts = [
+         part.split(key_val_sep)
+         for part in identifier_str.strip().split(parts_sep)
+         if key_val_sep in part
+     ]
+     parts_dict = dict(parts)
+     return {**defaults, **parts_dict}
+
+
+ def read_nested_json(path: str) -> pd.DataFrame:
+     # Read the nested JSON data into a pandas DataFrame
+     with open(path, "r") as f:
+         data = json.load(f)
+     result = pd.json_normalize(data, sep="/")
+     result.index.name = "entry"
+     return result
+
+
+ def read_nested_jsons(
+     json_paths: Dict[str, str],
+     default_key_values: Optional[Dict[str, str]] = None,
+     column_level_names: Optional[List[str]] = None,
+ ) -> pd.DataFrame:
+     identifier_strings = json_paths.keys()
+     dfs = [read_nested_json(json_paths[identifier_str]) for identifier_str in identifier_strings]
+     new_index_levels = pd.MultiIndex.from_frame(
+         pd.DataFrame(
+             [
+                 parse_identifier(identifier_str, default_key_values or {})
+                 for identifier_str in identifier_strings
+             ]
+         )
+     )
+     dfs_concat = pd.concat(dfs, keys=list(new_index_levels), names=new_index_levels.names, axis=0)
+     dfs_concat.columns = pd.MultiIndex.from_tuples(
+         [col.split("/") for col in dfs_concat.columns], names=column_level_names
+     )
+     return dfs_concat
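The keys of json_paths are parsed by parse_identifier into index levels from comma-separated "key=value" pairs, the same identifier format used in compare_job_returns.py below. A small hedged example, not part of the commit; the JSON paths are placeholders and the column level names are the defaults used in compare_job_returns.py.

```python
# Hedged sketch, not part of the commit; the JSON paths are placeholders.
from src.analysis.common import read_nested_jsons

df = read_nested_jsons(
    json_paths={
        "epochs=75,checkpoint=last": "logs/run_a/job_return_value.aggregated.json",
        "epochs=150,checkpoint=last": "logs/run_b/job_return_value.aggregated.json",
    },
    default_key_values={"checkpoint": "best_val"},
    column_level_names=["split", "average", "metric", "aggr"],
)
# rows: MultiIndex built from the "key=value" identifiers;
# columns: one level per "/"-separated segment of the flattened JSON keys
print(df.head())
```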
src/analysis/compare_job_returns.py ADDED
@@ -0,0 +1,407 @@
1
+ import pyrootutils
2
+
3
+ root = pyrootutils.setup_root(
4
+ search_from=__file__,
5
+ indicator=[".project-root"],
6
+ pythonpath=True,
7
+ dotenv=False,
8
+ )
9
+
10
+ import argparse
11
+ import io
12
+ import re
13
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
14
+
15
+ import pandas as pd
16
+ import plotly.graph_objects as go
17
+
18
+ from src.analysis.common import parse_identifier, read_nested_jsons
19
+
20
+
21
+ def read_markdown_table(
22
+ markdown_data: str,
23
+ default_key_values: Optional[Dict[str, str]] = None,
24
+ column_level_names: Optional[List[str]] = None,
25
+ ) -> pd.DataFrame:
26
+ # Read the markdown data into a pandas DataFrame
27
+ df = pd.read_csv(io.StringIO(markdown_data), sep="|", engine="python", skiprows=1)
28
+
29
+ # Clean up the DataFrame
30
+ # drop the first and last columns
31
+ df = df.drop(columns=[df.columns[0], df.columns[-1]])
32
+ # drop the first row
33
+ df = df.drop(0)
34
+ # make the index from the first column: parse the string and extract the values
35
+ df.index = pd.MultiIndex.from_tuples(
36
+ [tuple(x.strip()[2:-2].split("', '")) for x in df[df.columns[0]]]
37
+ )
38
+ # drop the first column
39
+ df = df.drop(columns=[df.columns[0]])
40
+ # parse the column names and create a MultiIndex
41
+ columns = pd.DataFrame(
42
+ [parse_identifier(col, defaults=default_key_values or {}) for col in df.columns]
43
+ )
44
+ df.columns = pd.MultiIndex.from_frame(columns)
45
+
46
+ # Function to parse the values and errors
47
+ def parse_value_error(value_error_str: str) -> Tuple[float, float]:
48
+ match = re.match(r"([0-9.]+) \(?± ?([0-9.]+)\)?", value_error_str.strip())
49
+ if match:
50
+ return float(match.group(1)), float(match.group(2))
51
+ raise ValueError(f"Invalid value error string: {value_error_str}")
52
+
53
+ df_mean_and_std_cells = df.map(lambda x: parse_value_error(x))
54
+ # make a new DataFrame with the mean and std values as new rows
55
+ result = pd.concat(
56
+ {
57
+ "mean": df_mean_and_std_cells.map(lambda x: x[0]),
58
+ "std": df_mean_and_std_cells.map(lambda x: x[1]),
59
+ },
60
+ axis=0,
61
+ )
62
+ # transpose the DataFrame
63
+ result = result.T
64
+ # move new column index level to the most inner level
65
+ result.columns = pd.MultiIndex.from_tuples(
66
+ [col[1:] + (col[0],) for col in result.columns], names=column_level_names
67
+ )
68
+ return result
69
+
70
+
71
+ def rearrange_for_plotting(
72
+ data: Union[pd.DataFrame, pd.Series], x_axis: str, x_is_numeric: bool
73
+ ) -> pd.DataFrame:
74
+ # rearrange the DataFrame for plotting
75
+ while not isinstance(data, pd.Series):
76
+ data = data.unstack()
77
+ result = data.unstack(x_axis)
78
+ if x_is_numeric:
79
+ result.columns = result.columns.astype(float)
80
+ return result
81
+
82
+
83
+ # Function to create plots
84
+ def create_plot(
85
+ title,
86
+ x_axis: str,
87
+ data: pd.DataFrame,
88
+ data_err: Optional[pd.DataFrame] = None,
89
+ x_is_numeric: bool = False,
90
+ marker_getter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
91
+ line_getter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
92
+ ):
93
+ data = rearrange_for_plotting(data, x_axis, x_is_numeric)
94
+ # sort the columns by the x_axis values
95
+ data = data[data.columns.sort_values()]
96
+
97
+ if data_err is not None:
98
+ data_err = rearrange_for_plotting(data_err, x_axis, x_is_numeric)
99
+ data_err = data_err[data.columns]
100
+
101
+ fig = go.Figure()
102
+ for trace_idx, row_data_mean in data.iterrows():
103
+ trace_meta = dict(zip(data.index.names, trace_idx))
104
+ if data_err is not None:
105
+ error_y = dict(type="data", array=data_err.loc[trace_idx])
106
+ else:
107
+ error_y = None
108
+ fig.add_trace(
109
+ go.Scatter(
110
+ x=row_data_mean.index,
111
+ y=row_data_mean,
112
+ error_y=error_y,
113
+ mode="lines+markers",
114
+ marker=marker_getter(trace_meta) if marker_getter is not None else None,
115
+ line=line_getter(trace_meta) if line_getter is not None else None,
116
+ name=", ".join(trace_idx),
117
+ )
118
+ )
119
+ fig.update_layout(title=title, xaxis_title=x_axis, yaxis_title="Values")
120
+ fig.show()
121
+
122
+
123
+ def prepare_for_markdown(
124
+ df: pd.DataFrame,
125
+ aggregation_column_level: Optional[str] = None,
126
+ round_precision: Optional[int] = None,
127
+ ) -> pd.DataFrame:
128
+ result = df.copy()
129
+ # simplify index: create single index from all levels in the format "level1_name=level1_val,level2_name=level2_val,..."
130
+ if isinstance(result.index, pd.MultiIndex):
131
+ result.index = [
132
+ ",".join([f"{name}={val}" for name, val in zip(result.index.names, idx)])
133
+ for idx in result.index
134
+ ]
135
+ else:
136
+ result.index = [f"{result.index.name}={idx}" for idx in result.index]
137
+ result = result.T
138
+ if round_precision is not None:
139
+ result = result.round(round_precision)
140
+ if aggregation_column_level is not None:
141
+ result_mean = result.xs("mean", level=aggregation_column_level, axis="index")
142
+ result_std = result.xs("std", level=aggregation_column_level, axis="index")
143
+ # combine each cell with mean and std into a single string
144
+ result = pd.DataFrame(
145
+ {
146
+ col: [f"{mean} (±{std})" for mean, std in zip(result_mean[col], result_std[col])]
147
+ for col in result_mean.columns
148
+ },
149
+ index=result_mean.index,
150
+ )
151
+
152
+ return result
153
+
154
+
155
+ def combine_job_returns_and_plot(
156
+ x_axis: str,
157
+ plot_column_level: str,
158
+ job_return_paths: Optional[Dict[str, str]] = None,
159
+ markdown_str: Optional[str] = None,
160
+ default_key_values: Optional[Dict[str, str]] = None,
161
+ column_level_names: Optional[List[str]] = None,
162
+ drop_columns: Optional[Dict[str, str]] = None,
163
+ aggregation_column_level: Optional[str] = None,
164
+ title_prefix: Optional[str] = None,
165
+ x_is_not_numeric: bool = False,
166
+ show_as: str = "plot",
167
+ markdown_round_precision: Optional[int] = None,
168
+ marker_getter: Optional[Callable] = None,
169
+ line_getter: Optional[Callable] = None,
170
+ # placeholder to allow description in CONFIGS
171
+ description: Optional[str] = None,
172
+ ):
173
+
174
+ if job_return_paths is not None:
175
+ df_all = read_nested_jsons(
176
+ json_paths=job_return_paths,
177
+ default_key_values=default_key_values,
178
+ column_level_names=column_level_names,
179
+ )
180
+ elif markdown_str is not None:
181
+ df_all = read_markdown_table(
182
+ markdown_data=markdown_str,
183
+ default_key_values=default_key_values,
184
+ column_level_names=column_level_names,
185
+ )
186
+ else:
187
+ raise ValueError("Either job_return_paths or markdown_str must be provided")
188
+
189
+ for metric, value in (drop_columns or {}).items():
190
+ df_all = df_all.drop(columns=value, level=metric)
191
+
192
+ # drop index levels where all values are the same
193
+ index_levels_to_drop = [
194
+ i
195
+ for i in range(df_all.index.nlevels)
196
+ if len(df_all.index.get_level_values(i).unique()) == 1
197
+ ]
198
+ dropped_index_levels = {
199
+ df_all.index.names[i]: df_all.index.get_level_values(i).unique()[0]
200
+ for i in index_levels_to_drop
201
+ }
202
+ if len(index_levels_to_drop) > 0:
203
+ print(f"Drop index levels: {dropped_index_levels}")
204
+ df_all = df_all.droplevel(index_levels_to_drop, axis="index")
205
+ # drop column levels where all values are the same
206
+ column_levels_to_drop = [
207
+ i
208
+ for i in range(df_all.columns.nlevels)
209
+ if len(df_all.columns.get_level_values(i).unique()) == 1
210
+ ]
211
+ dropped_column_levels = {
212
+ df_all.columns.names[i]: df_all.columns.get_level_values(i).unique()[0]
213
+ for i in column_levels_to_drop
214
+ }
215
+ if len(column_levels_to_drop) > 0:
216
+ print(f"Drop column levels: {dropped_column_levels}")
217
+ df_all = df_all.droplevel(column_levels_to_drop, axis="columns")
218
+
219
+ if show_as == "markdown":
220
+ print(
221
+ prepare_for_markdown(
222
+ df_all,
223
+ aggregation_column_level=aggregation_column_level,
224
+ round_precision=markdown_round_precision,
225
+ ).to_markdown()
226
+ )
227
+ elif show_as == "plots":
228
+ # create plots for each "average" value, i.e. MACRO, MICRO, but also label specific values
229
+ plot_names = df_all.columns.get_level_values(plot_column_level).unique()
230
+ for plot_name in plot_names:
231
+ data_plot = df_all.xs(plot_name, level=plot_column_level, axis="columns")
232
+ data_err = None
233
+ if aggregation_column_level is not None:
234
+ data_err = data_plot.xs("std", level=aggregation_column_level, axis="columns")
235
+ data_plot = data_plot.xs("mean", level=aggregation_column_level, axis="columns")
236
+
237
+ # Create plot for MACRO values
238
+ if title_prefix is not None:
239
+ plot_name = f"{title_prefix}: {plot_name}"
240
+ create_plot(
241
+ title=plot_name,
242
+ data=data_plot,
243
+ data_err=data_err,
244
+ x_axis=x_axis,
245
+ x_is_numeric=not x_is_not_numeric,
246
+ marker_getter=marker_getter,
247
+ line_getter=line_getter,
248
+ )
249
+ else:
250
+ raise ValueError(f"Invalid show_as: {show_as}")
251
+
252
+
253
+ CONFIGS = {
254
+ "joint model (adus) - last vs best val checkpoint @test": dict(
255
+ job_return_paths={
256
+ "epochs=75": "logs/document_evaluation/multiruns/default/2025-01-12_13-28-54/job_return_value.aggregated.json",
257
+ "epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_18-36-35/job_return_value.aggregated.json",
258
+ "epochs=150": "logs/document_evaluation/multiruns/default/2025-01-15_16-02-04/job_return_value.aggregated.json",
259
+ "epochs=150,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_22-07-14/job_return_value.aggregated.json",
260
+ "epochs=300": "logs/document_evaluation/multiruns/default/2025-01-16_18-50-43/job_return_value.aggregated.json",
261
+ "epochs=300,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_23-12-52/job_return_value.aggregated.json",
262
+ },
263
+ x_axis="epochs",
264
+ default_key_values={"checkpoint": "best_val"},
265
+ description="data from https://github.com/ArneBinder/pie-document-level/issues/334",
266
+ ),
267
+ "joint model (relations) - last vs best val checkpoint @test": dict(
268
+ job_return_paths={
269
+ "epochs=75": "logs/document_evaluation/multiruns/default/2025-01-12_13-30-25/job_return_value.aggregated.json",
270
+ "epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_18-38-55/job_return_value.aggregated.json",
271
+ "epochs=150": "logs/document_evaluation/multiruns/default/2025-01-15_13-32-33/job_return_value.aggregated.json",
272
+ "epochs=150,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_22-08-43/job_return_value.aggregated.json",
273
+ "epochs=300": "logs/document_evaluation/multiruns/default/2025-01-11_16-42-17/job_return_value.aggregated.json",
274
+ "epochs=300,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_23-14-13/job_return_value.aggregated.json",
275
+ },
276
+ x_axis="epochs",
277
+ default_key_values={"checkpoint": "best_val"},
278
+ description="data from https://github.com/ArneBinder/pie-document-level/issues/334",
279
+ ),
280
+ "joint model (adus) - last vs best val checkpoint @val": dict(
281
+ job_return_paths={
282
+ "epochs=75": "logs/document_evaluation/multiruns/default/2025-01-17_17-13-46/job_return_value.aggregated.json",
283
+ "epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-17_22-26-17/job_return_value.aggregated.json",
284
+ "epochs=150": "logs/document_evaluation/multiruns/default/2025-01-17_19-41-17/job_return_value.aggregated.json",
285
+ "epochs=150,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-17_22-40-54/job_return_value.aggregated.json",
286
+ "epochs=300": "logs/document_evaluation/multiruns/default/2025-01-17_20-00-01/job_return_value.aggregated.json",
287
+ "epochs=300,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-17_22-51-51/job_return_value.aggregated.json",
288
+ },
289
+ x_axis="epochs",
290
+ default_key_values={"checkpoint": "best_val"},
291
+ description="data from https://github.com/ArneBinder/pie-document-level/issues/334",
292
+ ),
293
+ "joint model (relations) - last vs best val checkpoint @val": dict(
294
+ job_return_paths={
295
+ "epochs=75": "logs/document_evaluation/multiruns/default/2025-01-17_17-16-01/job_return_value.aggregated.json",
296
+ "epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-17_22-28-24/job_return_value.aggregated.json",
297
+ "epochs=150": "logs/document_evaluation/multiruns/default/2025-01-17_19-42-59/job_return_value.aggregated.json",
298
+ "epochs=150,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-17_22-41-19/job_return_value.aggregated.json",
299
+ "epochs=300": "logs/document_evaluation/multiruns/default/2025-01-17_20-01-16/job_return_value.aggregated.json",
300
+ "epochs=300,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-17_22-52-16/job_return_value.aggregated.json",
301
+ },
302
+ x_axis="epochs",
303
+ default_key_values={"checkpoint": "best_val"},
304
+ description="data from https://github.com/ArneBinder/pie-document-level/issues/334",
305
+ ),
306
+ "joint model (adus) - 27 vs 31 train docs @test": dict(
307
+ job_return_paths={
308
+ "num_train_docs=27,epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_18-36-35/job_return_value.aggregated.json",
309
+ "num_train_docs=31,epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_11-35-12/job_return_value.aggregated.json",
310
+ },
311
+ x_axis="num_train_docs",
312
+ description="data from https://github.com/ArneBinder/pie-document-level/issues/334",
313
+ ),
314
+ "joint model (relations) - 27 vs 31 train docs @test": dict(
315
+ job_return_paths={
316
+ "num_train_docs=27,epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_18-38-55/job_return_value.aggregated.json",
317
+ "num_train_docs=31,epochs=75,checkpoint=last": "logs/document_evaluation/multiruns/default/2025-01-16_11-36-52/job_return_value.aggregated.json",
318
+ },
319
+ x_axis="num_train_docs",
320
+ description="data from https://github.com/ArneBinder/pie-document-level/issues/334",
321
+ ),
322
+ }
323
+
324
+ DEFAULT_KWARGS = {
325
+ "column_level_names": ["split", "average", "metric", "aggr"],
326
+ "plot_column_level": "average",
327
+ "x_is_not_numeric": False,
328
+ "aggregation_column_level": "aggr",
329
+ "drop_columns": {"metric": "s"},
330
+ "show_as": "plots",
331
+ "markdown_round_precision": 3,
332
+ }
333
+
334
+ if __name__ == "__main__":
335
+
336
+ parser = argparse.ArgumentParser(
337
+ description="Compare multiple job results for predefined setups (see positional choice argument) "
338
+ "by creating plots or a markdown table."
339
+ )
340
+ parser.add_argument(
341
+ "config",
342
+ type=str,
343
+ help="config name (will also be the title prefix)",
344
+ choices=CONFIGS.keys(),
345
+ )
346
+ parser.add_argument(
347
+ "--column-level-names",
348
+ type=lambda x: x.strip().split(","),
349
+ help="comma separated list of column level names. Note that column levels are "
350
+ "created for each nesting level in the JSON data",
351
+ )
352
+ parser.add_argument("--plot-column-level", type=str, help="column level to create plots for")
353
+ parser.add_argument("--x-axis", type=str, help="column level to use as x-axis")
354
+ parser.add_argument("--x-is-not-numeric", help="set if x-axis is not numeric")
355
+ parser.add_argument(
356
+ "--aggregation-column-level",
357
+ type=str,
358
+ help="column level that contains the aggregation type (e.g. mean, std)",
359
+ )
360
+ parser.add_argument(
361
+ "--drop-columns",
362
+ type=lambda x: dict(part.split(":") for part in x.strip().split(",")),
363
+ help="a comma separated list of key-value pairs in the format level_name=level_value to "
364
+ "drop columns with the specific level values",
365
+ )
366
+ parser.add_argument(
367
+ "--show-as",
368
+ type=str,
369
+ help="show the data as 'plots' or 'markdown'",
370
+ )
371
+ parser.add_argument(
372
+ "--markdown-round-precision", type=int, help="round precision for show markdown"
373
+ )
374
+ args = parser.parse_args()
375
+
376
+ user_kwargs = vars(args)
377
+ config_name = user_kwargs.pop("config")
378
+
379
+ def get_marker(trace_meta):
380
+ checkpoint2marker_style = {"best_val": "circle-open", "last": "x"}
381
+ return dict(symbol=checkpoint2marker_style[trace_meta.get("checkpoint", "last")], size=12)
382
+
383
+ def get_line(trace_meta):
384
+ metric2checkpoint2color = {
385
+ "f1": {"best_val": "lightblue", "last": "blue"},
386
+ "f": {"best_val": "lightblue", "last": "blue"},
387
+ "p": {"best_val": "lightgreen", "last": "green"},
388
+ "r": {"best_val": "lightcoral", "last": "red"},
389
+ }
390
+ return dict(
391
+ color=metric2checkpoint2color[trace_meta["metric"]][
392
+ trace_meta.get("checkpoint", "last")
393
+ ],
394
+ width=2,
395
+ )
396
+
397
+ kwargs = {
398
+ "title_prefix": config_name,
399
+ "line_getter": get_line,
400
+ "marker_getter": get_marker,
401
+ **DEFAULT_KWARGS,
402
+ **CONFIGS[config_name],
403
+ }
404
+ for key, value in user_kwargs.items():
405
+ if value is not None:
406
+ kwargs[key] = value
407
+ combine_job_returns_and_plot(**kwargs)
src/data/acl_anthology_crawler.py ADDED
@@ -0,0 +1,117 @@
1
+ import pyrootutils
2
+
3
+ root = pyrootutils.setup_root(
4
+ search_from=__file__,
5
+ indicator=[".project-root"],
6
+ pythonpath=True,
7
+ dotenv=True,
8
+ )
9
+
10
+ import os
11
+ from argparse import ArgumentParser, RawTextHelpFormatter
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+
15
+ from acl_anthology import Anthology
16
+ from tqdm import tqdm
17
+
18
+ from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers
19
+ from src.utils.pdf_utils.process_pdf import (
20
+ FulltextExtractor,
21
+ GrobidFulltextExtractor,
22
+ PDFDownloader,
23
+ )
24
+
25
+ HELP_MSG = """
26
+ Generate paper json files from an ACL Anthology collection, with fulltext extraction.
27
+
28
+ Iterate over entries in the ACL Anthology metadata, and for each entry:
29
+ 1. extract relevant paper info from the xml entry
30
+ 2. download pdf file
31
+ 3. extract fulltext
32
+ 4. format a json file and save
33
+
34
+ pre-requisites:
35
+ - Install the requirements: pip install acl-anthology-py>=0.4.3 bs4 jsonschema
36
+ - Get the meta data from ACL Anthology: git clone git@github.com:acl-org/acl-anthology.git
37
+ - Start Grobid Docker container: docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
38
+ """
39
+
40
+
41
+ @dataclass
42
+ class XML2Jsons:
43
+ base_output_dir: Path
44
+ pdf_output_dir: Path
45
+
46
+ xml2raw_papers: XML2RawPapers
47
+ pdf_downloader: PDFDownloader = field(default_factory=PDFDownloader)
48
+ fulltext_extractor: FulltextExtractor = field(default_factory=GrobidFulltextExtractor)
49
+ show_progress: bool = True
50
+
51
+ @classmethod
52
+ def from_cli(cls) -> "XML2Jsons":
53
+ parser = ArgumentParser(description=HELP_MSG, formatter_class=RawTextHelpFormatter)
54
+ parser.add_argument(
55
+ "--base-output-dir", type=str, help="Directory to save all the paper json files"
56
+ )
57
+ parser.add_argument(
58
+ "--pdf-output-dir", type=str, help="Directory to save all the downloaded pdf files"
59
+ )
60
+ parser.add_argument(
61
+ "--anthology-data-dir",
62
+ type=str,
63
+ help="Path to ACL Anthology metadata directory, e.g., /path/to/acl-anthology-repo/data. "
64
+ "You can obtain the data via: git clone git@github.com:acl-org/acl-anthology.git",
65
+ )
66
+ parser.add_argument(
67
+ "--collection-id-filters",
68
+ nargs="+",
69
+ type=str,
70
+ default=None,
71
+ help="If provided, only papers from the collections whose id (Anthology ID) contains the "
72
+ "specified strings will be processed.",
73
+ )
74
+ parser.add_argument(
75
+ "--venue-id-whitelist",
76
+ nargs="+",
77
+ type=str,
78
+ default=None,
79
+ help="If provided, only papers from the specified venues will be processed. See here for "
80
+ "the list of venues: https://aclanthology.org/venues",
81
+ )
82
+ args = parser.parse_args()
83
+
84
+ return cls(
85
+ base_output_dir=Path(args.base_output_dir),
86
+ pdf_output_dir=Path(args.pdf_output_dir),
87
+ xml2raw_papers=XML2RawPapers(
88
+ anthology=Anthology(datadir=args.anthology_data_dir),
89
+ collection_id_filters=args.collection_id_filters,
90
+ venue_id_whitelist=args.venue_id_whitelist,
91
+ ),
92
+ )
93
+
94
+ def run(self):
95
+ os.makedirs(self.pdf_output_dir, exist_ok=True)
96
+ papers = self.xml2raw_papers()
97
+ if self.show_progress:
98
+ papers = tqdm(list(papers), desc="extracting fulltext")
99
+ for paper in papers:
100
+ volume_dir = self.base_output_dir / paper.volume_id
101
+ if paper.url is not None:
102
+ pdf_save_path = self.pdf_downloader.download(
103
+ paper.url, opath=self.pdf_output_dir / f"{paper.name}.pdf"
104
+ )
105
+ fulltext_extraction_output = self.fulltext_extractor(pdf_save_path)
106
+
107
+ if fulltext_extraction_output:
108
+ plain_text, extraction_data = fulltext_extraction_output
109
+ paper.fulltext = extraction_data.get("sections")
110
+ if not paper.abstract:
111
+ paper.abstract = extraction_data.get("abstract")
112
+ paper.save(str(volume_dir))
113
+
114
+
115
+ if __name__ == "__main__":
116
+ xml2jsons = XML2Jsons.from_cli()
117
+ xml2jsons.run()
src/data/calc_iaa_for_brat.py ADDED
@@ -0,0 +1,272 @@
1
+ from collections.abc import Iterable
2
+
3
+ import pyrootutils
4
+ from pytorch_ie import Document
5
+
6
+ root = pyrootutils.setup_root(
7
+ search_from=__file__,
8
+ indicator=[".project-root"],
9
+ pythonpath=True,
10
+ dotenv=True,
11
+ )
12
+
13
+ import argparse
14
+ from functools import partial
15
+ from typing import Callable, List, Optional, Union
16
+
17
+ import pandas as pd
18
+ from pie_datasets import Dataset, load_dataset
19
+ from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans
20
+ from pie_modules.document.processing import RelationArgumentSorter, SpansViaRelationMerger
21
+ from pytorch_ie.metrics import F1Metric
22
+
23
+ from src.document.processing import align_predicted_span_annotations
24
+
25
+
26
+ def add_annotations_as_predictions(document: BratDocument, other: BratDocument) -> BratDocument:
27
+ document = document.copy()
28
+ other = other.copy()
29
+ document.spans.predictions.extend(other.spans.clear())
30
+ gold2gold_span_mapping = {span: span for span in document.spans}
31
+ predicted2maybe_gold_span = {}
32
+ for span in document.spans.predictions:
33
+ predicted2maybe_gold_span[span] = gold2gold_span_mapping.get(span, span)
34
+ predicted_relations = [
35
+ rel.copy(
36
+ head=predicted2maybe_gold_span[rel.head], tail=predicted2maybe_gold_span[rel.tail]
37
+ )
38
+ for rel in other.relations.clear()
39
+ ]
40
+ document.relations.predictions.extend(predicted_relations)
41
+ return document
42
+
43
+
44
+ def remove_annotations_existing_in_other(
45
+ document: BratDocumentWithMergedSpans, other: BratDocumentWithMergedSpans
46
+ ) -> BratDocumentWithMergedSpans:
47
+ result = document.copy(with_annotations=False)
48
+ document = document.copy()
49
+ other = other.copy()
50
+
51
+ spans = set(document.spans.clear()) - set(other.spans.clear())
52
+ relations = set(document.relations.clear()) - set(other.relations.clear())
53
+ result.spans.extend(spans)
54
+ result.relations.extend(relations)
55
+
56
+ return result
57
+
58
+
59
+ def unnest_dict(d):
60
+ result = {}
61
+ for key, value in d.items():
62
+ if isinstance(value, dict):
63
+ unnested = unnest_dict(value)
64
+ for k, v in unnested.items():
65
+ result[(key,) + k] = v
66
+ else:
67
+ result[(key,)] = value
68
+ return result
69
+
70
+
71
+ def calc_brat_iaas(
72
+ annotator_dirs: List[str],
73
+ ignore_annotation_dir: Optional[str] = None,
74
+ combine_fragmented_spans_via_relation: Optional[str] = None,
75
+ sort_arguments_of_relations: Optional[List[str]] = None,
76
+ align_spans: bool = False,
77
+ show_results: bool = False,
78
+ per_file: bool = False,
79
+ ) -> Union[pd.Series, List[pd.Series]]:
80
+ if len(annotator_dirs) < 2:
81
+ raise ValueError("At least two annotation dirs must be provided")
82
+
83
+ span_aligner = None
84
+ if align_spans:
85
+ span_aligner = partial(align_predicted_span_annotations, span_layer="spans")
86
+
87
+ if combine_fragmented_spans_via_relation is not None:
88
+ print(f"Combine fragmented spans via {combine_fragmented_spans_via_relation} relations")
89
+ merger = SpansViaRelationMerger(
90
+ relation_layer="relations",
91
+ link_relation_label=combine_fragmented_spans_via_relation,
92
+ create_multi_spans=True,
93
+ result_document_type=BratDocument,
94
+ result_field_mapping={"spans": "spans", "relations": "relations"},
95
+ )
96
+ else:
97
+ merger = None
98
+
99
+ if sort_arguments_of_relations is not None and len(sort_arguments_of_relations) > 0:
100
+ print(f"Sort arguments of relations with labels {sort_arguments_of_relations}")
101
+ relation_argument_sorter = RelationArgumentSorter(
102
+ relation_layer="relations",
103
+ label_whitelist=sort_arguments_of_relations, # ["parts_of_same", "semantically_same", "contradicts"],
104
+ )
105
+ else:
106
+ relation_argument_sorter = None
107
+
108
+ all_docs = [
109
+ load_dataset(
110
+ "pie/brat",
111
+ name="merge_fragmented_spans",
112
+ base_dataset_kwargs=dict(data_dir=annotation_dir),
113
+ split="train",
114
+ ).map(lambda doc: doc.deduplicate_annotations())
115
+ for annotation_dir in annotator_dirs
116
+ ]
117
+
118
+ if ignore_annotation_dir is not None:
119
+ print(f"Ignoring annotations loaded from {ignore_annotation_dir}")
120
+ ignore_annotation_docs = load_dataset(
121
+ "pie/brat",
122
+ name="merge_fragmented_spans",
123
+ base_dataset_kwargs=dict(data_dir=ignore_annotation_dir),
124
+ split="train",
125
+ )
126
+ ignore_annotation_docs_dict = {doc.id: doc for doc in ignore_annotation_docs}
127
+ all_docs = [
128
+ docs.map(
129
+ lambda doc: remove_annotations_existing_in_other(
130
+ doc, other=ignore_annotation_docs_dict[doc.id]
131
+ )
132
+ )
133
+ for docs in all_docs
134
+ ]
135
+
136
+ if relation_argument_sorter is not None:
137
+ all_docs = [docs.map(relation_argument_sorter) for docs in all_docs]
138
+
139
+ if per_file:
140
+ results_per_doc = []
141
+ for docs_tuple in zip(*all_docs):
142
+ if show_results:
143
+ print(f"\ncalculate scores for document id={docs_tuple[0].id} ...")
144
+ docs = [Dataset.from_documents([doc]) for doc in docs_tuple]
145
+ result_per_doc = calc_brat_iaas_for_docs(
146
+ docs, span_aligner=span_aligner, merger=merger, show_results=show_results
147
+ )
148
+ results_per_doc.append(result_per_doc)
149
+ return results_per_doc
150
+
151
+ else:
152
+ return calc_brat_iaas_for_docs(
153
+ all_docs, span_aligner=span_aligner, merger=merger, show_results=show_results
154
+ )
155
+
156
+
157
+ def calc_brat_iaas_for_docs(
158
+ all_docs: List[Dataset],
159
+ span_aligner: Optional[Callable] = None,
160
+ merger: Optional[Callable] = None,
161
+ show_results: bool = False,
162
+ ) -> pd.Series:
163
+ num_annotators = len(all_docs)
164
+ all_docs_dict = [{doc.id: doc for doc in docs} for docs in all_docs]
165
+ gold_predicted = {}
166
+ for gold_annotator_idx in range(num_annotators):
167
+ gold = all_docs[gold_annotator_idx]
168
+ for predicted_annotator_idx in range(num_annotators):
169
+ if gold_annotator_idx == predicted_annotator_idx:
170
+ continue
171
+ predicted_dict = all_docs_dict[predicted_annotator_idx]
172
+ gold_predicted[(gold_annotator_idx, predicted_annotator_idx)] = gold.map(
173
+ lambda doc: add_annotations_as_predictions(doc, other=predicted_dict[doc.id])
174
+ )
175
+
176
+ spans_metric = F1Metric(layer="spans", labels="INFERRED", show_as_markdown=True)
177
+ relations_metric = F1Metric(layer="relations", labels="INFERRED", show_as_markdown=True)
178
+
179
+ metric_values = {}
180
+ for gold_annotator_idx, predicted_annotator_idx in gold_predicted:
181
+ print(
182
+ f"calculate scores for annotations {gold_annotator_idx} -> {predicted_annotator_idx}"
183
+ )
184
+ for doc in gold_predicted[(gold_annotator_idx, predicted_annotator_idx)]:
185
+ if span_aligner is not None:
186
+ doc = span_aligner(doc)
187
+ if merger is not None:
188
+ doc = merger(doc)
189
+ spans_metric(doc)
190
+ relations_metric(doc)
191
+ metric_id = f"gold:{gold_annotator_idx},predicted:{predicted_annotator_idx}"
192
+ metric_values[metric_id] = {
193
+ "spans": spans_metric.compute(reset=True),
194
+ "relations": relations_metric.compute(reset=True),
195
+ }
196
+
197
+ result = pd.Series(unnest_dict(metric_values))
198
+ if show_results:
199
+ metric_values_series_mean = result.unstack(0).mean(axis=1)
200
+ metric_values_relations = metric_values_series_mean.xs("relations").unstack()
201
+ metric_values_spans = metric_values_series_mean.xs("spans").unstack()
202
+
203
+ print("\nspans:")
204
+ print(metric_values_spans.round(decimals=3).to_markdown())
205
+
206
+ print("\nrelations:")
207
+ print(metric_values_relations.round(decimals=3).to_markdown())
208
+
209
+ return result
210
+
211
+
212
+ if __name__ == "__main__":
213
+
214
+ """
215
+ example call:
216
+ python calc_iaa_for_brat.py \
217
+ --annotation-dirs annotations/sciarg/v0.9/with_abstracts_rin annotations/sciarg/v0.9/with_abstracts_alisa \
218
+ --ignore-annotation-dir annotations/sciarg/v0.9/original
219
+ """
220
+
221
+ parser = argparse.ArgumentParser(
222
+ description="Calculate inter-annotator agreement for spans and relations in means of F1 "
223
+ "(exact match, i.e. offsets / arguments and labels must match) for two or more BRAT "
224
+ "annotation directories."
225
+ )
226
+ parser.add_argument(
227
+ "--annotation-dirs",
228
+ type=str,
229
+ required=True,
230
+ nargs="+",
231
+ help="List of annotation directories. At least two directories must be provided.",
232
+ )
233
+ parser.add_argument(
234
+ "--ignore-annotation-dir",
235
+ type=str,
236
+ default=None,
237
+ help="If set, ignore annotations loaded from this directory.",
238
+ )
239
+ parser.add_argument(
240
+ "--combine-fragmented-spans-via-relation",
241
+ type=str,
242
+ default=None,
243
+ help="If set, combine fragmented spans via this relation type.",
244
+ )
245
+ parser.add_argument(
246
+ "--sort-arguments-of-relations",
247
+ type=str,
248
+ default=None,
249
+ nargs="+",
250
+ help="If set, sort the arguments of the relations with the given labels.",
251
+ )
252
+ parser.add_argument(
253
+ "--align-spans",
254
+ action="store_true",
255
+ help="If set, align the spans of the predicted annotations to the gold annotations.",
256
+ )
257
+ parser.add_argument(
258
+ "--per-file",
259
+ action="store_true",
260
+ help="If set, calculate IAA per file.",
261
+ )
262
+ args = parser.parse_args()
263
+
264
+ metric_values_series = calc_brat_iaas(
265
+ annotator_dirs=args.annotation_dirs,
266
+ ignore_annotation_dir=args.ignore_annotation_dir,
267
+ combine_fragmented_spans_via_relation=args.combine_fragmented_spans_via_relation,
268
+ sort_arguments_of_relations=args.sort_arguments_of_relations,
269
+ align_spans=args.align_spans,
270
+ per_file=args.per_file,
271
+ show_results=True,
272
+ )
src/data/construct_sciarg_abstracts_remaining_gold_retrieval.py ADDED
@@ -0,0 +1,238 @@
1
+ import pyrootutils
2
+
3
+ root = pyrootutils.setup_root(
4
+ search_from=__file__,
5
+ indicator=[".project-root"],
6
+ pythonpath=True,
7
+ # dotenv=True,
8
+ )
9
+
10
+ import argparse
11
+ import logging
12
+ import os
13
+ from collections import defaultdict
14
+ from typing import List, Optional, Sequence, Tuple, TypeVar
15
+
16
+ import pandas as pd
17
+ from pie_datasets import load_dataset
18
+ from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans
19
+ from pytorch_ie.annotations import LabeledMultiSpan
20
+ from pytorch_ie.documents import (
21
+ TextBasedDocument,
22
+ TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
23
+ TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
24
+ )
25
+
26
+ from src.document.processing import replace_substrings_in_text_with_spaces
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ def multi_span_is_in_span(multi_span: LabeledMultiSpan, range_span: Tuple[int, int]) -> bool:
32
+ start, end = range_span
33
+ starts, ends = zip(*multi_span.slices)
34
+ return start <= min(starts) and max(ends) <= end
35
+
36
+
37
+ def filter_multi_spans(
38
+ multi_spans: Sequence[LabeledMultiSpan], filter_span: Tuple[int, int]
39
+ ) -> List[LabeledMultiSpan]:
40
+ return [
41
+ span
42
+ for span in multi_spans
43
+ if multi_span_is_in_span(multi_span=span, range_span=filter_span)
44
+ ]
45
+
46
+
47
+ def shift_multi_span_slices(
48
+ slices: Sequence[Tuple[int, int]], shift: int
49
+ ) -> List[Tuple[int, int]]:
50
+ return [(start + shift, end + shift) for start, end in slices]
51
+
52
+
53
+ def construct_gold_retrievals(
54
+ doc: TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
55
+ symmetric_relations: Optional[List[str]] = None,
56
+ relation_label_whitelist: Optional[List[str]] = None,
57
+ ) -> Optional[pd.DataFrame]:
58
+ abstract_annotations = [
59
+ span for span in doc.labeled_partitions if span.label.lower().strip() == "abstract"
60
+ ]
61
+ if len(abstract_annotations) != 1:
62
+ logger.warning(
63
+ f"Expected exactly one abstract annotation, found {len(abstract_annotations)}"
64
+ )
65
+ return None
66
+ abstract_annotation = abstract_annotations[0]
67
+ span_abstract = (abstract_annotation.start, abstract_annotation.end)
68
+ span_remaining = (abstract_annotation.end, len(doc.text))
69
+ labeled_multi_spans = list(doc.labeled_multi_spans)
70
+ spans_in_abstract = set(
71
+ span for span in labeled_multi_spans if multi_span_is_in_span(span, span_abstract)
72
+ )
73
+ spans_in_remaining = set(
74
+ span for span in labeled_multi_spans if multi_span_is_in_span(span, span_remaining)
75
+ )
76
+ spans_not_covered = set(labeled_multi_spans) - spans_in_abstract - spans_in_remaining
77
+ if len(spans_not_covered) > 0:
78
+ logger.warning(
79
+ f"Found {len(spans_not_covered)} spans not covered by abstract or remaining text"
80
+ )
81
+
82
+ rel_arg_and_label2other = defaultdict(list)
83
+ for rel in doc.binary_relations:
84
+ rel_arg_and_label2other[rel.head].append((rel.tail, rel.label))
85
+ if symmetric_relations is not None and rel.label in symmetric_relations:
86
+ label_reversed = rel.label
87
+ else:
88
+ label_reversed = f"{rel.label}_reversed"
89
+ rel_arg_and_label2other[rel.tail].append((rel.head, label_reversed))
90
+
91
+ result_rows = []
92
+ for rel in doc.binary_relations:
93
+ # we check all semantically_same relations that link a span in the abstract (head) to a span in the remaining text (tail) ...
94
+ if rel.label == "semantically_same":
95
+ if rel.head in spans_in_abstract and rel.tail in spans_in_remaining:
96
+ # ... and, starting from the tail (the matching span in the remaining text), collect all related spans as retrieval candidates
97
+ # candidate_query_span = rel.tail
98
+ candidate_spans_with_label = rel_arg_and_label2other[rel.tail]
99
+ for candidate_span, rel_label in candidate_spans_with_label:
100
+ if (
101
+ relation_label_whitelist is not None
102
+ and rel_label not in relation_label_whitelist
103
+ ):
104
+ continue
105
+ result_row = {
106
+ "doc_id": f"{doc.id}.remaining.{span_remaining[0]}.txt",
107
+ "query_doc_id": f"{doc.id}.abstract.{span_abstract[0]}_{span_abstract[1]}.txt",
108
+ "span": shift_multi_span_slices(candidate_span.slices, -span_remaining[0]),
109
+ "query_span": shift_multi_span_slices(rel.head.slices, -span_abstract[0]),
110
+ "ref_span": shift_multi_span_slices(rel.tail.slices, -span_remaining[0]),
111
+ "type": rel_label,
112
+ "label": candidate_span.label,
113
+ "ref_label": rel.tail.label,
114
+ }
115
+ result_rows.append(result_row)
116
+
117
+ if len(result_rows) > 0:
118
+ return pd.DataFrame(result_rows)
119
+ else:
120
+ return None
121
+
122
+
123
+ D_text = TypeVar("D_text", bound=TextBasedDocument)
124
+
125
+
126
+ def clean_doc(doc: D_text) -> D_text:
127
+ # remove xml tags. Note that we also remove the Abstract tag, in contrast to the preprocessing
128
+ # pipeline (see configs/dataset/sciarg_cleaned.yaml). This is because there, the abstracts are
129
+ # removed completely.
130
+ doc = replace_substrings_in_text_with_spaces(
131
+ doc,
132
+ substrings=[
133
+ "</H2>",
134
+ "<H3>",
135
+ "</Document>",
136
+ "<H1>",
137
+ "<H2>",
138
+ "</H3>",
139
+ "</H1>",
140
+ "<Abstract>",
141
+ "</Abstract>",
142
+ ],
143
+ )
144
+ return doc
145
+
146
+
147
+ def main(
148
+ data_dir: str,
149
+ out_path: str,
150
+ doc_id_whitelist: Optional[List[str]] = None,
151
+ symmetric_relations: Optional[List[str]] = None,
152
+ relation_label_whitelist: Optional[List[str]] = None,
153
+ ) -> None:
154
+ logger.info(f"Loading dataset from {data_dir}")
155
+ sciarg_with_abstracts = load_dataset(
156
+ "pie/sciarg",
157
+ revision="171478ce3c13cc484be5d7c9bc8f66d7d2f1c210",
158
+ base_dataset_kwargs={"data_dir": data_dir, "split_paths": None},
159
+ name="resolve_parts_of_same",
160
+ split="train",
161
+ )
162
+ if issubclass(sciarg_with_abstracts.document_type, BratDocument):
163
+ ds_converted = sciarg_with_abstracts.to_document_type(
164
+ TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions
165
+ )
166
+ elif issubclass(sciarg_with_abstracts.document_type, BratDocumentWithMergedSpans):
167
+ ds_converted = sciarg_with_abstracts.to_document_type(
168
+ TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
169
+ )
170
+ else:
171
+ raise ValueError(f"Unsupported document type {sciarg_with_abstracts.document_type}")
172
+
173
+ ds_clean = ds_converted.map(clean_doc)
174
+ if doc_id_whitelist is not None:
175
+ num_before = len(ds_clean)
176
+ ds_clean = [doc for doc in ds_clean if doc.id in doc_id_whitelist]
177
+ logger.info(
178
+ f"Filtered dataset from {num_before} to {len(ds_clean)} documents based on doc_id_whitelist"
179
+ )
180
+
181
+ results_per_doc = [
182
+ construct_gold_retrievals(
183
+ doc,
184
+ symmetric_relations=symmetric_relations,
185
+ relation_label_whitelist=relation_label_whitelist,
186
+ )
187
+ for doc in ds_clean
188
+ ]
189
+ results_per_doc_not_empty = [doc for doc in results_per_doc if doc is not None]
190
+ if len(results_per_doc_not_empty) > 0:
191
+ results = pd.concat(results_per_doc_not_empty, ignore_index=True)
192
+ # sort to make the output deterministic
193
+ results = results.sort_values(
194
+ by=results.columns.tolist(), ignore_index=True, key=lambda s: s.apply(str)
195
+ )
196
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
197
+ logger.info(f"Saving result ({len(results)}) to {out_path}")
198
+ results.to_json(out_path)
199
+ else:
200
+ logger.warning("No results found")
201
+
202
+
203
+ if __name__ == "__main__":
204
+ parser = argparse.ArgumentParser(
205
+ description="Create gold retrievals for SciArg-abstracts-remaining in the same format as the retrieval results"
206
+ )
207
+ parser.add_argument(
208
+ "--data_dir",
209
+ type=str,
210
+ default="data/annotations/sciarg-with-abstracts-and-cross-section-rels",
211
+ help="Path to the sciarg data directory",
212
+ )
213
+ parser.add_argument(
214
+ "--out_path",
215
+ type=str,
216
+ default="data/retrieval_results/sciarg-with-abstracts-and-cross-section-rels/gold.json",
217
+ help="Path to save the results",
218
+ )
219
+ parser.add_argument(
220
+ "--symmetric_relations",
221
+ type=str,
222
+ nargs="+",
223
+ default=None,
224
+ help="Relations that are symmetric, i.e., if A is related to B, then B is related to A",
225
+ )
226
+ parser.add_argument(
227
+ "--relation_label_whitelist",
228
+ type=str,
229
+ nargs="+",
230
+ default=None,
231
+ help="Only consider relations with these labels",
232
+ )
233
+
234
+ logging.basicConfig(level=logging.INFO)
235
+
236
+ kwargs = vars(parser.parse_args())
237
+ main(**kwargs)
238
+ logger.info("Done")
src/data/prepare_sciarg_crosssection_annotations.py ADDED
@@ -0,0 +1,398 @@
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import re
5
+ import shutil
6
+ from collections import defaultdict
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ import pandas as pd
10
+ from pie_datasets import Dataset, IterableDataset, load_dataset
11
+ from pie_datasets.builders.brat import BratDocumentWithMergedSpans
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def find_span_idx(raw_text: str, span_string: str) -> Optional[List]:
17
+ """
18
+ Match span string to raw text (document).
19
+ Return a list of (start, end) tuples, one per occurrence of the span string in the raw text
20
+ (an empty list if the span string does not occur).
21
+ """
22
+ # remove accidentally added leading/trailing whitespace
23
+ span_string = span_string.strip()
24
+ # use raw text input as regex-safe pattern
25
+ safe = re.escape(span_string)
26
+ pattern = rf"{safe}"
27
+ # find match(es)
28
+ out = [(s.start(), s.end()) for s in re.finditer(pattern, raw_text)]
29
+ return out
30
+
31
+
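A quick illustration of the return value of find_span_idx (e.g. in an interactive session after importing this module):

    find_span_idx("the cat sat on the mat", "the")  # [(0, 3), (15, 18)]
    find_span_idx("the cat sat on the mat", "dog")  # []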
32
+ def append_spans_start_and_end(
33
+ raw_text: str,
34
+ pd_table: pd.DataFrame,
35
+ input_cols: List[str],
36
+ input_idx_cols: List[str],
37
+ output_cols: List[str],
38
+ doc_id_col: str = "doc ID",
39
+ ) -> pd.DataFrame:
40
+ """
41
+ Create new column(s) for span indexes (i.e. start and end as Tuple) in pd.DataFrame from span strings.
42
+ Warn if
43
+ 1) span string does not match anything in document -> None,
44
+ 2) span string is not unique in the document -> List[Tuple].
45
+ """
46
+ pd_table = pd_table.join(pd.DataFrame(columns=output_cols))
47
+ for idx, pd_row in pd_table.iterrows():
48
+ for in_col, idx_col, out_col in zip(input_cols, input_idx_cols, output_cols):
49
+ span_indices = find_span_idx(raw_text, pd_row[in_col])
50
+ str_idx = pd_row[idx_col]
51
+ span_idx = None
52
+ if span_indices is None or len(span_indices) == 0:
53
+ logger.warning(
54
+ f'The span "{pd_row[in_col]}" in Column "{in_col}" does not exist in {pd_row[doc_id_col]}.'
55
+ )
56
+ elif len(span_indices) == 1:
57
+ # warn if the string-index column is not empty (str_idx == str_idx is False only for NaN), although the span is unique
58
+ if str_idx == str_idx:
59
+ logger.warning(f'Column "{idx_col}" is not empty. It has value: {str_idx}.')
60
+ span_idx = span_indices.pop()
61
+ else:
62
+ # warn if the span is not unique but the string-index column is empty (str_idx != str_idx only for NaN)
63
+ if str_idx != str_idx:
64
+ logger.warning(
65
+ f'The span "{pd_row[in_col]}" in Column "{in_col}" is not unique,'
66
+ f'but, column "{idx_col}" is empty. '
67
+ f"Need a string index to specify the non-unique span."
68
+ )
69
+ else:
70
+ span_idx = span_indices.pop(int(str_idx))
71
+
72
+ if span_idx is not None:
73
+ pd_table.at[idx, out_col] = span_idx
74
+
75
+ # sanity check (NOTE: this should live in a test)
76
+ search_string = pd_row[in_col]
77
+ reconstructed_string = raw_text[span_idx[0] : span_idx[1]]
78
+ if search_string != reconstructed_string:
79
+ raise ValueError(
80
+ f"Reconstructed string does not match the original string. "
81
+ f"Original: {search_string}, Reconstructed: {reconstructed_string}"
82
+ )
83
+ return pd_table
84
+
85
+
86
+ def get_texts_from_pie_dataset(
87
+ doc_ids: List[str], **dataset_kwargs
88
+ ) -> Dict[str, BratDocumentWithMergedSpans]:
89
+ """Get texts from a PIE dataset for a list of document IDs.
90
+
91
+ :param doc_ids: list of document IDs
92
+ :param dataset_kwargs: keyword arguments to pass to load_dataset
93
+
94
+ :return: a dictionary with document IDs as keys and texts as values
95
+ """
96
+
97
+ text_based_dataset = load_dataset(**dataset_kwargs)
98
+ if not isinstance(text_based_dataset, (Dataset, IterableDataset)):
99
+ raise ValueError(
100
+ f"Expected a PIE Dataset or PIE IterableDataset, but got a {type(text_based_dataset)} instead."
101
+ )
102
+ if not issubclass(text_based_dataset.document_type, BratDocumentWithMergedSpans):
103
+ raise ValueError(
104
+ f"Expected a PIE Dataset with BratDocumentWithMergedSpans as document type, "
105
+ f"but got {text_based_dataset.document_type} instead."
106
+ )
107
+ doc_id2text = {doc.id: doc for doc in text_based_dataset}
108
+ return {doc_id: doc_id2text[doc_id] for doc_id in doc_ids}
109
+
110
+
111
+ def set_span_annotation_ids(
112
+ table: pd.DataFrame,
113
+ doc_id2doc: Dict[str, BratDocumentWithMergedSpans],
114
+ doc_id_col: str,
115
+ span_idx_cols: List[str],
116
+ span_id_cols: List[str],
117
+ ) -> pd.DataFrame:
118
+ """
119
+ Create new column(s) for span annotation IDs in pd.DataFrame from span indexes. The annotation IDs are
120
+ retrieved from the TextBasedDocument object using the span indexes.
121
+
122
+ :param table: pd.DataFrame with span indexes, document IDs, and other columns
123
+ :param doc_id2doc: dictionary with document IDs as keys and BratDocumentWithMergedSpans objects as values
124
+ :param doc_id_col: column name that contains document IDs
125
+ :param span_idx_cols: column names that contain span indexes
126
+ :param span_id_cols: column names for new span ID columns
127
+
128
+ :return: pd.DataFrame with new columns for span annotation IDs
129
+ """
130
+ table = table.join(pd.DataFrame(columns=span_id_cols))
131
+ span2id: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)
132
+ for doc_id, doc in doc_id2doc.items():
133
+ for span_id, span in zip(doc.metadata["span_ids"], doc.spans):
134
+ span2id[doc_id][(span.start, span.end)] = span_id
135
+
136
+ for span_idx_col, span_id_col in zip(span_idx_cols, span_id_cols):
137
+ table[span_id_col] = table.apply(
138
+ lambda row: span2id[row[doc_id_col]][tuple(row[span_idx_col])], axis=1
139
+ )
140
+
141
+ return table
142
+
143
+
144
+ def set_relation_annotation_ids(
145
+ table: pd.DataFrame,
146
+ doc_id2doc: Dict[str, BratDocumentWithMergedSpans],
147
+ doc_id_col: str,
148
+ relation_id_col: str,
149
+ ) -> pd.DataFrame:
150
+ """create new column for relation annotation IDs in pd.DataFrame. They are simply new ids starting from the last
151
+ relation annotation id in the document.
152
+
153
+ :param table: pd.DataFrame with document IDs and other columns
154
+ :param doc_id2doc: dictionary with document IDs as keys and BratDocumentWithMergedSpans objects as values
155
+ :param doc_id_col: column name that contains document IDs
156
+ :param relation_id_col: column name for new relation ID column
157
+
158
+ :return: pd.DataFrame with new column for relation annotation IDs
159
+ """
160
+
161
+ table = table.join(pd.DataFrame(columns=[relation_id_col]))
162
+ doc_id2highest_relation_id = defaultdict(int)
163
+
164
+ for doc_id, doc in doc_id2doc.items():
165
+ # relation ids are prefixed with "R" in the dataset
166
+ doc_id2highest_relation_id[doc_id] = max(
167
+ [int(relation_id[1:]) for relation_id in doc.metadata["relation_ids"]]
168
+ )
169
+
170
+ for idx, row in table.iterrows():
171
+ doc_id = row[doc_id_col]
172
+ doc_id2highest_relation_id[doc_id] += 1
173
+ table.at[idx, relation_id_col] = f"R{doc_id2highest_relation_id[doc_id]}"
174
+
175
+ return table
176
+
177
+
178
+ def main(
179
+ input_path: str,
180
+ output_path: str,
181
+ brat_data_dir: str,
182
+ input_encoding: str,
183
+ include_unsure: bool = False,
184
+ doc_id_col: str = "doc ID",
185
+ unsure_col: str = "unsure",
186
+ span_str_cols: List[str] = ["head argument string", "tail argument string"],
187
+ str_idx_cols: List[str] = ["head string index", "tail string index"],
188
+ span_idx_cols: List[str] = ["head_span_idx", "tail_span_idx"],
189
+ span_id_cols: List[str] = ["head_span_id", "tail_span_id"],
190
+ relation_id_col: str = "relation_id",
191
+ set_annotation_ids: bool = False,
192
+ relation_type: str = "relation",
193
+ ) -> None:
194
+ """
195
+ Convert cross-section (long-distance) relation annotations from a CSV file to a JSON format. The input table should have
196
+ columns for document IDs, argument span strings, and string indexes (required in the case that the
197
+ span string occurs multiple times in the base text). The argument span strings are matched to the
198
+ base text to get the start and end indexes of the span. The output JSON file will have the same
199
+ columns as the input file, plus two additional columns holding the (start, end) indexes of the head and tail spans.
200
+
201
+ :param input_path: path to a CSV/Excel file that contains annotations
202
+ :param output_path: path to save JSON output
203
+ :param brat_data_dir: directory where the BRAT data (base texts and annotations) is located
204
+ :param input_encoding: encoding of the input file. Only used for CSV files. Default: "cp1252"
205
+ :param include_unsure: include annotations marked as unsure. Default: False
206
+ :param doc_id_col: column name that contains document IDs. Default: "doc ID"
207
+ :param unsure_col: column name that contains unsure annotations. Default: "unsure"
208
+ :param span_str_cols: column names that contain span strings. Default: ["head argument string", "tail argument string"]
209
+ :param str_idx_cols: column names that contain string indexes. Default: ["head string index", "tail string index"]
210
+ :param span_idx_cols: column names for new span-index columns. Default: ["head_span_idx", "tail_span_idx"]
211
+ :param span_id_cols: column names for new span-ID columns. Default: ["head_span_id", "tail_span_id"]
212
+ :param relation_id_col: column name for new relation-ID column. Default: "relation_id"
213
+ :param set_annotation_ids: set annotation IDs for the spans and relations. Default: False
214
+ :param relation_type: specify the relation type for the BRAT output. Default: "relation"
215
+
216
+ :return: None
217
+ """
218
+ # get annotations from a csv file
219
+ if input_path.lower().endswith(".csv"):
220
+ input_df = pd.read_csv(input_path, encoding=input_encoding)
221
+ elif input_path.lower().endswith(".xlsx"):
222
+ logger.warning(
223
+ f"encoding parameter (--input-encoding={input_encoding}) is ignored for Excel files."
224
+ )
225
+ input_df = pd.read_excel(input_path)
226
+ else:
227
+ raise ValueError("Input file has unexpected format. Please provide a CSV or Excel file.")
228
+
229
+ # remove unsure
230
+ if not include_unsure:
231
+ input_df = input_df[input_df[unsure_col].isna()]
232
+ # remove all empty columns
233
+ input_df = input_df.dropna(axis=1, how="all")
234
+
235
+ # define output DataFrame
236
+ result_df = pd.DataFrame(columns=[*input_df.columns, *span_idx_cols])
237
+
238
+ # get unique document IDs
239
+ doc_ids = list(input_df[doc_id_col].unique())
240
+
241
+ # get base texts from a PIE SciArg dataset
242
+ doc_id2doc = get_texts_from_pie_dataset(
243
+ doc_ids=doc_ids,
244
+ path="pie/brat",
245
+ name="merge_fragmented_spans",
246
+ split="train",
247
+ revision="769a15e44e7d691148dd05e54ae2b058ceaed1f0",
248
+ base_dataset_kwargs=dict(data_dir=brat_data_dir),
249
+ )
250
+
251
+ for doc_id in doc_ids:
252
+
253
+ # iterate over each sub-df that contains annotations for a single document
254
+ doc_df = input_df[input_df[doc_id_col] == doc_id]
255
+ input_df = input_df.drop(doc_df.index)
256
+ # get spans' start and end indexes as new columns
257
+ doc_with_span_indices_df = append_spans_start_and_end(
258
+ raw_text=doc_id2doc[doc_id].text,
259
+ pd_table=doc_df,
260
+ input_cols=span_str_cols,
261
+ input_idx_cols=str_idx_cols,
262
+ output_cols=span_idx_cols,
263
+ )
264
+ # append this sub-df (with spans' indexes columns) to result_df
265
+ result_df = pd.concat(
266
+ [result_df if not result_df.empty else None, doc_with_span_indices_df]
267
+ )
268
+
269
+ out_ext = os.path.splitext(output_path)[1]
270
+ save_as_brat = out_ext == ""
271
+
272
+ if set_annotation_ids or save_as_brat:
273
+ result_df = set_span_annotation_ids(
274
+ table=result_df,
275
+ doc_id2doc=doc_id2doc,
276
+ doc_id_col=doc_id_col,
277
+ span_idx_cols=span_idx_cols,
278
+ span_id_cols=span_id_cols,
279
+ )
280
+ result_df = set_relation_annotation_ids(
281
+ table=result_df,
282
+ doc_id2doc=doc_id2doc,
283
+ doc_id_col=doc_id_col,
284
+ relation_id_col=relation_id_col,
285
+ )
286
+
287
+ base_dir = os.path.dirname(output_path)
288
+ os.makedirs(base_dir, exist_ok=True)
289
+
290
+ if out_ext.lower() == ".json":
291
+ logger.info(f"Saving output in JSON format to {output_path} ...")
292
+ result_df.to_json(
293
+ path_or_buf=output_path,
294
+ orient="records",
295
+ lines=True,
296
+ ) # possible orient values: 'split','index', 'table','records', 'columns', 'values'
297
+ elif save_as_brat:
298
+ logger.info(f"Saving output in BRAT format to {output_path} ...")
299
+ os.makedirs(output_path, exist_ok=True)
300
+ for doc_id in doc_ids:
301
+ # handle the base text file (just copy from the BRAT data directory)
302
+ shutil.copy(
303
+ src=os.path.join(brat_data_dir, f"{doc_id}.txt"),
304
+ dst=os.path.join(output_path, f"{doc_id}.txt"),
305
+ )
306
+
307
+ # handle the annotation file
308
+ # first, read the original annotation file
309
+ input_ann_path = os.path.join(brat_data_dir, f"{doc_id}.ann")
310
+ with open(input_ann_path, "r") as f:
311
+ ann_lines = f.readlines()
312
+ # then, append new relation annotations
313
+ # The format for each line is (see https://brat.nlplab.org/standoff.html):
314
+ # R{relation_id}\t{relation_type} Arg1:{span_id1} Arg2:{span_id2}
315
+ doc_df = result_df[result_df[doc_id_col] == doc_id]
316
+ logger.info(f"Adding {len(doc_df)} relation annotations to {doc_id}.ann ...")
317
+ for idx, row in doc_df.iterrows():
318
+ head_span_id = row[span_id_cols[0]]
319
+ tail_span_id = row[span_id_cols[1]]
320
+ relation_id = row[relation_id_col]
321
+ ann_line = (
322
+ f"{relation_id}\t{relation_type} Arg1:{head_span_id} Arg2:{tail_span_id}\n"
323
+ )
324
+ ann_lines.append(ann_line)
325
+ # finally, write the new annotation file
326
+ output_ann_path = os.path.join(output_path, f"{doc_id}.ann")
327
+ with open(output_ann_path, "w") as f:
328
+ f.writelines(ann_lines)
329
+ else:
330
+ raise ValueError(
331
+ "Output file has unexpected format. Please provide a JSON file or a directory."
332
+ )
333
+
334
+ logger.info("Done!")
335
+
336
+
337
+ if __name__ == "__main__":
338
+
339
+ """
340
+ example call:
341
+ python src/data/prepare_sciarg_crosssection_annotations.py
342
+ // or //
343
+ python src/data/prepare_sciarg_crosssection_annotations.py \
344
+ --input-path data/annotations/sciarg-cross-section/aligned_input.csv \
345
+ --output-path data/annotations/sciarg-with-abstracts-and-cross-section-rels \
346
+ --brat-data-dir data/annotations/sciarg-abstracts/v0.9.3/alisa
347
+ """
348
+
349
+ logging.basicConfig(level=logging.INFO)
350
+
351
+ parser = argparse.ArgumentParser(
352
+ description="Read text files in a directory and a CSV file that contains cross-section annotations. "
353
+ "Transform the CSV file to a JSON format and save at a specified output directory."
354
+ )
355
+ parser.add_argument(
356
+ "--input-path",
357
+ type=str,
358
+ default="data/annotations/sciarg-cross-section/aligned_input.csv",
359
+ help="Locate a CSV/Excel file.",
360
+ )
361
+ parser.add_argument(
362
+ "--output-path",
363
+ type=str,
364
+ default="data/annotations/sciarg-with-abstracts-and-cross-section-rels",
365
+ help="Specify a path where output will be saved. Should be a JSON file or a directory for BRAT output.",
366
+ )
367
+ parser.add_argument(
368
+ "--brat-data-dir",
369
+ type=str,
370
+ default="data/annotations/sciarg-abstracts/v0.9.3/alisa",
371
+ help="Specify the directory where the BRAT data (base texts and annotations) is located.",
372
+ )
373
+ parser.add_argument(
374
+ "--relation-type",
375
+ type=str,
376
+ default="semantically_same",
377
+ help="Specify the relation type for the BRAT output.",
378
+ )
379
+ parser.add_argument(
380
+ "--input-encoding",
381
+ type=str,
382
+ default="cp1252",
383
+ help="Specify encoding for reading an input file.",
384
+ )
385
+ parser.add_argument(
386
+ "--include-unsure",
387
+ action="store_true",
388
+ help="Include annotations marked as unsure.",
389
+ )
390
+ parser.add_argument(
391
+ "--set-annotation-ids",
392
+ action="store_true",
393
+ help="Set BRAT annotation IDs for the spans and relations.",
394
+ )
395
+ args = parser.parse_args()
396
+ kwargs = vars(args)
397
+
398
+ main(**kwargs)
src/data/split_sciarg_abstracts.py ADDED
@@ -0,0 +1,132 @@
 
 
1
+ import pyrootutils
2
+
3
+ root = pyrootutils.setup_root(
4
+ search_from=__file__,
5
+ indicator=[".project-root"],
6
+ pythonpath=True,
7
+ dotenv=True,
8
+ )
9
+
10
+ import argparse
11
+ import logging
12
+ import os
13
+ from typing import List, Optional, TypeVar
14
+
15
+ from pie_datasets import load_dataset
16
+ from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans
17
+ from pytorch_ie.documents import (
18
+ TextBasedDocument,
19
+ TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
20
+ TextDocumentWithLabeledPartitions,
21
+ TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
22
+ )
23
+
24
+ from src.document.processing import replace_substrings_in_text_with_spaces
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ def save_abstract_and_remaining_text(
30
+ doc: TextDocumentWithLabeledPartitions, base_path: str
31
+ ) -> None:
32
+ abstract_annotations = [
33
+ span for span in doc.labeled_partitions if span.label.lower().strip() == "abstract"
34
+ ]
35
+ if len(abstract_annotations) != 1:
36
+ logger.warning(
37
+ f"Expected exactly one abstract annotation, found {len(abstract_annotations)}"
38
+ )
39
+ return
40
+ abstract_annotation = abstract_annotations[0]
41
+ text_abstract = doc.text[abstract_annotation.start : abstract_annotation.end]
42
+ text_remaining = doc.text[abstract_annotation.end :]
43
+ with open(
44
+ f"{base_path}.abstract.{abstract_annotation.start}_{abstract_annotation.end}.txt", "w"
45
+ ) as f:
46
+ f.write(text_abstract)
47
+ with open(f"{base_path}.remaining.{abstract_annotation.end}.txt", "w") as f:
48
+ f.write(text_remaining)
49
+
50
+
51
+ D_text = TypeVar("D_text", bound=TextBasedDocument)
52
+
53
+
54
+ def clean_doc(doc: D_text) -> D_text:
55
+ # remove xml tags. Note that we also remove the Abstract tag, in contrast to the preprocessing
56
+ # pipeline (see configs/dataset/sciarg_cleaned.yaml). This is because there, the abstracts are
57
+ # removed completely.
58
+ doc = replace_substrings_in_text_with_spaces(
59
+ doc,
60
+ substrings=[
61
+ "</H2>",
62
+ "<H3>",
63
+ "</Document>",
64
+ "<H1>",
65
+ "<H2>",
66
+ "</H3>",
67
+ "</H1>",
68
+ "<Abstract>",
69
+ "</Abstract>",
70
+ ],
71
+ )
72
+ return doc
73
+
74
+
75
+ def main(out_dir: str, doc_id_whitelist: Optional[List[str]] = None) -> None:
76
+ logger.info("Loading dataset from pie/sciarg")
77
+ sciarg_with_abstracts = load_dataset(
78
+ "pie/sciarg",
79
+ revision="171478ce3c13cc484be5d7c9bc8f66d7d2f1c210",
80
+ split="train",
81
+ )
82
+ if issubclass(sciarg_with_abstracts.document_type, BratDocument):
83
+ ds_converted = sciarg_with_abstracts.to_document_type(
84
+ TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions
85
+ )
86
+ elif issubclass(sciarg_with_abstracts.document_type, BratDocumentWithMergedSpans):
87
+ ds_converted = sciarg_with_abstracts.to_document_type(
88
+ TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
89
+ )
90
+ else:
91
+ raise ValueError(f"Unsupported document type {sciarg_with_abstracts.document_type}")
92
+
93
+ ds_clean = ds_converted.map(clean_doc)
94
+ if doc_id_whitelist is not None:
95
+ num_before = len(ds_clean)
96
+ ds_clean = [doc for doc in ds_clean if doc.id in doc_id_whitelist]
97
+ logger.info(
98
+ f"Filtered dataset from {num_before} to {len(ds_clean)} documents based on doc_id_whitelist"
99
+ )
100
+
101
+ os.makedirs(out_dir, exist_ok=True)
102
+ logger.info(f"Saving dataset to {out_dir}")
103
+ for doc in ds_clean:
104
+ save_abstract_and_remaining_text(doc, os.path.join(out_dir, doc.id))
105
+
106
+
107
+ if __name__ == "__main__":
108
+ parser = argparse.ArgumentParser(
109
+ description="Split SciArg dataset into abstract and remaining text"
110
+ )
111
+ parser.add_argument(
112
+ "--out_dir",
113
+ type=str,
114
+ default="data/datasets/sciarg/abstracts_and_remaining_text",
115
+ help="Path to save the split data",
116
+ )
117
+ parser.add_argument(
118
+ "--doc_id_whitelist",
119
+ type=str,
120
+ nargs="+",
121
+ default=["A32", "A33", "A34", "A35", "A36", "A37", "A38", "A39", "A40"],
122
+ help="List of document ids to include in the split",
123
+ )
124
+
125
+ logging.basicConfig(level=logging.INFO)
126
+
127
+ kwargs = vars(parser.parse_args())
128
+ # allow for "all" to include all documents
129
+ if len(kwargs["doc_id_whitelist"]) == 1 and kwargs["doc_id_whitelist"][0].lower() == "all":
130
+ kwargs["doc_id_whitelist"] = None
131
+ main(**kwargs)
132
+ logger.info("Done")
src/demo/annotation_utils.py CHANGED
@@ -1,7 +1,9 @@
 
1
  import logging
2
- from typing import Optional, Sequence, Union
3
 
4
  import gradio as gr
 
5
  from pie_modules.document.processing import RegexPartitioner, SpansViaRelationMerger
6
 
7
  # this is required to dynamically load the PIE models
@@ -10,7 +12,6 @@ from pie_modules.taskmodules import * # noqa: F403
10
  from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
11
  from pytorch_ie import Pipeline
12
  from pytorch_ie.annotations import LabeledSpan
13
- from pytorch_ie.auto import AutoPipeline
14
  from pytorch_ie.documents import (
15
  TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
16
  TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
@@ -20,9 +21,25 @@ from pytorch_ie.documents import (
20
  from pytorch_ie.models import * # noqa: F403
21
  from pytorch_ie.taskmodules import * # noqa: F403
22
 
 
 
23
  logger = logging.getLogger(__name__)
24
 
25
 
 
 
26
  def annotate_document(
27
  document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
28
  argumentation_model: Pipeline,
@@ -40,23 +57,40 @@ def annotate_document(
40
  """
41
 
42
  # execute prediction pipeline
43
- argumentation_model(document)
 
 
44
 
45
  if handle_parts_of_same:
46
- merger = SpansViaRelationMerger(
47
- relation_layer="binary_relations",
48
- link_relation_label="parts_of_same",
49
- create_multi_spans=True,
50
- result_document_type=TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
51
- result_field_mapping={
52
- "labeled_spans": "labeled_multi_spans",
53
- "binary_relations": "binary_relations",
54
- "labeled_partitions": "labeled_partitions",
55
- },
56
- )
57
- document = merger(document)
58
 
59
- return document
 
 
60
 
61
 
62
  def create_document(
@@ -88,32 +122,45 @@ def create_document(
88
  return document
89
 
90
 
91
- def load_argumentation_model(
92
- model_name: str,
93
- revision: Optional[str] = None,
94
- device: str = "cpu",
95
- ) -> Pipeline:
 
 
96
  try:
97
- # the Pipeline class expects an integer for the device
98
- if device == "cuda":
99
- pipeline_device = 0
100
- elif device.startswith("cuda:"):
101
- pipeline_device = int(device.split(":")[1])
102
- elif device == "cpu":
103
- pipeline_device = -1
104
- else:
105
- raise gr.Error(f"Invalid device: {device}")
106
-
107
- model = AutoPipeline.from_pretrained(
108
- model_name,
109
- device=pipeline_device,
110
- num_workers=0,
111
- taskmodule_kwargs=dict(revision=revision),
112
- model_kwargs=dict(revision=revision),
113
- )
114
- gr.Info(
115
- f"Loaded argumentation model: model_name={model_name}, revision={revision}, device={device}"
116
- )
117
  except Exception as e:
118
  raise gr.Error(f"Failed to load argumentation model: {e}")
119
 
 
1
+ import json
2
  import logging
3
+ from typing import Iterable, Optional, Sequence, Union
4
 
5
  import gradio as gr
6
+ from hydra.utils import instantiate
7
  from pie_modules.document.processing import RegexPartitioner, SpansViaRelationMerger
8
 
9
  # this is required to dynamically load the PIE models
 
12
  from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
13
  from pytorch_ie import Pipeline
14
  from pytorch_ie.annotations import LabeledSpan
 
15
  from pytorch_ie.documents import (
16
  TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
17
  TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
 
21
  from pytorch_ie.models import * # noqa: F403
22
  from pytorch_ie.taskmodules import * # noqa: F403
23
 
24
+ from src.utils import parse_config
25
+
26
  logger = logging.getLogger(__name__)
27
 
28
 
29
+ def get_merger() -> SpansViaRelationMerger:
30
+ return SpansViaRelationMerger(
31
+ relation_layer="binary_relations",
32
+ link_relation_label="parts_of_same",
33
+ create_multi_spans=True,
34
+ result_document_type=TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
35
+ result_field_mapping={
36
+ "labeled_spans": "labeled_multi_spans",
37
+ "binary_relations": "binary_relations",
38
+ "labeled_partitions": "labeled_partitions",
39
+ },
40
+ )
41
+
42
+
43
  def annotate_document(
44
  document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
45
  argumentation_model: Pipeline,
 
57
  """
58
 
59
  # execute prediction pipeline
60
+ result: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions = argumentation_model(
61
+ document, inplace=True
62
+ )
63
 
64
  if handle_parts_of_same:
65
+ merger = get_merger()
66
+ result = merger(result)
 
 
 
 
 
 
 
 
 
 
67
 
68
+ return result
69
+
70
+
71
+ def annotate_documents(
72
+ documents: Sequence[TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions],
73
+ argumentation_model: Pipeline,
74
+ handle_parts_of_same: bool = False,
75
+ ) -> Union[
76
+ Sequence[TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions],
77
+ Sequence[TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions],
78
+ ]:
79
+ """Annotate a sequence of documents with the provided pipeline.
80
+
81
+ Args:
82
+ documents: The documents to annotate.
83
+ argumentation_model: The pipeline to use for annotation.
84
+ handle_parts_of_same: Whether to merge spans that are part of the same entity into a single multi span.
85
+ """
86
+ # execute prediction pipeline
87
+ result = argumentation_model(documents, inplace=True)
88
+
89
+ if handle_parts_of_same:
90
+ merger = get_merger()
91
+ result = [merger(document) for document in result]
92
+
93
+ return result
94
 
95
 
96
  def create_document(
 
122
  return document
123
 
124
 
125
+ def create_documents(
126
+ texts: Iterable[str], doc_ids: Iterable[str], split_regex: Optional[str] = None
127
+ ) -> Sequence[TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
128
+ """Create a sequence of TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions from the provided
129
+ texts.
130
+
131
+ Parameters:
132
+ texts: The texts to process.
133
+ doc_ids: The IDs of the documents.
134
+ split_regex: A regular expression pattern to use for splitting the text into partitions.
135
+
136
+ Returns:
137
+ The processed documents.
138
+ """
139
+ return [
140
+ create_document(text=text, doc_id=doc_id, split_regex=split_regex)
141
+ for text, doc_id in zip(texts, doc_ids)
142
+ ]
143
+
144
+
145
+ def load_argumentation_model(config_str: str, **kwargs) -> Pipeline:
146
  try:
147
+ config = parse_config(config_str, format="yaml")
148
+
149
+ # for PIE AutoPipeline, we need to handle the revision separately for
150
+ # the taskmodule and the model
151
+ if (
152
+ config.get("_target_") == "pytorch_ie.auto.AutoPipeline.from_pretrained"
153
+ and "revision" in config
154
+ ):
155
+ revision = config.pop("revision")
156
+ if "taskmodule_kwargs" not in config:
157
+ config["taskmodule_kwargs"] = {}
158
+ config["taskmodule_kwargs"]["revision"] = revision
159
+ if "model_kwargs" not in config:
160
+ config["model_kwargs"] = {}
161
+ config["model_kwargs"]["revision"] = revision
162
+ model = instantiate(config, **kwargs)
163
+ gr.Info(f"Loaded argumentation model: {json.dumps({**config, **kwargs})}")
 
 
 
164
  except Exception as e:
165
  raise gr.Error(f"Failed to load argumentation model: {e}")
166
 
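To illustrate the revision handling in load_argumentation_model above, a hedged sketch of the config transformation (model name and revision are placeholder values):

    config = {
        "_target_": "pytorch_ie.auto.AutoPipeline.from_pretrained",
        "pretrained_model_name_or_path": "my-org/my-adu-re-model",  # placeholder
        "revision": "abc123",  # placeholder
    }
    # after the revision handling, the top-level revision has been moved into both kwargs dicts:
    # {
    #     "_target_": "pytorch_ie.auto.AutoPipeline.from_pretrained",
    #     "pretrained_model_name_or_path": "my-org/my-adu-re-model",
    #     "taskmodule_kwargs": {"revision": "abc123"},
    #     "model_kwargs": {"revision": "abc123"},
    # }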
src/demo/backend_utils.py CHANGED
@@ -2,17 +2,20 @@ import json
2
  import logging
3
  import os
4
  import tempfile
 
5
  from typing import Iterable, List, Optional, Sequence
6
 
7
  import gradio as gr
8
  import pandas as pd
 
9
  from pie_datasets import Dataset, IterableDataset, load_dataset
10
  from pytorch_ie import Pipeline
11
  from pytorch_ie.documents import (
12
  TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
13
  )
 
14
 
15
- from src.demo.annotation_utils import annotate_document, create_document
16
  from src.demo.data_utils import load_text_from_arxiv
17
  from src.demo.rendering_utils import (
18
  RENDER_WITH_DISPLACY,
@@ -25,6 +28,8 @@ from src.langchain_modules import (
25
  DocumentAwareSpanRetriever,
26
  DocumentAwareSpanRetrieverWithRelations,
27
  )
 
 
28
 
29
  logger = logging.getLogger(__name__)
30
 
@@ -58,20 +63,18 @@ def process_texts(
58
  # check that doc_ids are unique
59
  if len(set(doc_ids)) != len(list(doc_ids)):
60
  raise gr.Error("Document IDs must be unique.")
61
- pie_documents = [
62
- create_document(text=text, doc_id=doc_id, split_regex=split_regex_escaped)
63
- for text, doc_id in zip(texts, doc_ids)
64
- ]
 
65
  if verbose:
66
  gr.Info(f"Annotate {len(pie_documents)} documents...")
67
- pie_documents = [
68
- annotate_document(
69
- document=pie_document,
70
- argumentation_model=argumentation_model,
71
- handle_parts_of_same=handle_parts_of_same,
72
- )
73
- for pie_document in pie_documents
74
- ]
75
  add_annotated_pie_documents(
76
  retriever=retriever,
77
  pie_documents=pie_documents,
@@ -140,6 +143,94 @@ def process_uploaded_files(
140
  return retriever.docstore.overview(layer_captions=layer_captions, use_predictions=True)
141
 
142
 
 
 
143
  def wrapped_add_annotated_pie_documents_from_dataset(
144
  retriever: DocumentAwareSpanRetriever, verbose: bool, layer_captions: dict[str, str], **kwargs
145
  ) -> pd.DataFrame:
@@ -193,6 +284,7 @@ def render_annotated_document(
193
  document_id: str,
194
  render_with: str,
195
  render_kwargs_json: str,
 
196
  ) -> str:
197
  text, spans, span_id2idx, relations = get_text_spans_and_relations_from_document(
198
  retriever=retriever, document_id=document_id
@@ -213,6 +305,7 @@ def render_annotated_document(
213
  spans=spans,
214
  span_id2idx=span_id2idx,
215
  binary_relations=relations,
 
216
  **render_kwargs,
217
  )
218
  else:
 
2
  import logging
3
  import os
4
  import tempfile
5
+ from pathlib import Path
6
  from typing import Iterable, List, Optional, Sequence
7
 
8
  import gradio as gr
9
  import pandas as pd
10
+ from acl_anthology import Anthology
11
  from pie_datasets import Dataset, IterableDataset, load_dataset
12
  from pytorch_ie import Pipeline
13
  from pytorch_ie.documents import (
14
  TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
15
  )
16
+ from tqdm import tqdm
17
 
18
+ from src.demo.annotation_utils import annotate_documents, create_documents
19
  from src.demo.data_utils import load_text_from_arxiv
20
  from src.demo.rendering_utils import (
21
  RENDER_WITH_DISPLACY,
 
28
  DocumentAwareSpanRetriever,
29
  DocumentAwareSpanRetrieverWithRelations,
30
  )
31
+ from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers
32
+ from src.utils.pdf_utils.process_pdf import FulltextExtractor, PDFDownloader
33
 
34
  logger = logging.getLogger(__name__)
35
 
 
63
  # check that doc_ids are unique
64
  if len(set(doc_ids)) != len(list(doc_ids)):
65
  raise gr.Error("Document IDs must be unique.")
66
+ pie_documents = create_documents(
67
+ texts=texts,
68
+ doc_ids=doc_ids,
69
+ split_regex=split_regex_escaped,
70
+ )
71
  if verbose:
72
  gr.Info(f"Annotate {len(pie_documents)} documents...")
73
+ pie_documents = annotate_documents(
74
+ documents=pie_documents,
75
+ argumentation_model=argumentation_model,
76
+ handle_parts_of_same=handle_parts_of_same,
77
+ )
 
 
 
78
  add_annotated_pie_documents(
79
  retriever=retriever,
80
  pie_documents=pie_documents,
 
143
  return retriever.docstore.overview(layer_captions=layer_captions, use_predictions=True)
144
 
145
 
146
+ def process_uploaded_pdf_files(
147
+ pdf_fulltext_extractor: Optional[FulltextExtractor],
148
+ file_names: List[str],
149
+ retriever: DocumentAwareSpanRetriever,
150
+ layer_captions: dict[str, str],
151
+ **kwargs,
152
+ ) -> pd.DataFrame:
153
+ try:
154
+ if pdf_fulltext_extractor is None:
155
+ raise gr.Error("PDF fulltext extractor is not available.")
156
+ doc_ids = []
157
+ texts = []
158
+ for file_name in file_names:
159
+ if file_name.lower().endswith(".pdf"):
160
+ # extract the fulltext from the pdf
161
+ text_and_extraction_data = pdf_fulltext_extractor(file_name)
162
+ if text_and_extraction_data is None:
163
+ raise gr.Error(f"Failed to extract fulltext from PDF: {file_name}")
164
+ text, _ = text_and_extraction_data
165
+
166
+ base_file_name = os.path.basename(file_name)
167
+ doc_ids.append(base_file_name)
168
+ texts.append(text)
169
+
170
+ else:
171
+ raise gr.Error(f"Unsupported file format: {file_name}")
172
+ process_texts(texts=texts, doc_ids=doc_ids, retriever=retriever, verbose=True, **kwargs)
173
+ except Exception as e:
174
+ raise gr.Error(f"Failed to process uploaded files: {e}")
175
+
176
+ return retriever.docstore.overview(layer_captions=layer_captions, use_predictions=True)
177
+
178
+
179
+ def load_acl_anthology_venues(
180
+ venues: List[str],
181
+ pdf_fulltext_extractor: Optional[FulltextExtractor],
182
+ retriever: DocumentAwareSpanRetriever,
183
+ layer_captions: dict[str, str],
184
+ acl_anthology_data_dir: Optional[str],
185
+ pdf_output_dir: Optional[str],
186
+ show_progress: bool = True,
187
+ **kwargs,
188
+ ) -> pd.DataFrame:
189
+ try:
190
+ if pdf_fulltext_extractor is None:
191
+ raise gr.Error("PDF fulltext extractor is not available.")
192
+ if acl_anthology_data_dir is None:
193
+ raise gr.Error("ACL Anthology data directory is not provided.")
194
+ if pdf_output_dir is None:
195
+ raise gr.Error("PDF output directory is not provided.")
196
+ xml2raw_papers = XML2RawPapers(
197
+ anthology=Anthology(datadir=Path(acl_anthology_data_dir)),
198
+ venue_id_whitelist=venues,
199
+ verbose=False,
200
+ )
201
+ pdf_downloader = PDFDownloader()
202
+ doc_ids = []
203
+ texts = []
204
+ os.makedirs(pdf_output_dir, exist_ok=True)
205
+ papers = xml2raw_papers()
206
+ if show_progress:
207
+ papers_list = list(papers)
208
+ papers = tqdm(papers_list, desc="extracting fulltext")
209
+ gr.Info(
210
+ f"Downloading and extracting fulltext from {len(papers_list)} papers in venues: {venues}"
211
+ )
212
+ for paper in papers:
213
+ if paper.url is not None:
214
+ pdf_save_path = pdf_downloader.download(
215
+ paper.url, opath=Path(pdf_output_dir) / f"{paper.name}.pdf"
216
+ )
217
+ fulltext_extraction_output = pdf_fulltext_extractor(pdf_save_path)
218
+
219
+ if fulltext_extraction_output:
220
+ text, _ = fulltext_extraction_output
221
+ doc_id = f"aclanthology.org/{paper.name}"
222
+ doc_ids.append(doc_id)
223
+ texts.append(text)
224
+ else:
225
+ gr.Warning(f"Failed to extract fulltext from PDF: {paper.url}")
226
+
227
+ process_texts(texts=texts, doc_ids=doc_ids, retriever=retriever, verbose=True, **kwargs)
228
+ except Exception as e:
229
+ raise gr.Error(f"Failed to process uploaded files: {e}")
230
+
231
+ return retriever.docstore.overview(layer_captions=layer_captions, use_predictions=True)
232
+
233
+
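A minimal standalone sketch of the download-and-extract flow used by the two functions above (URL and paths are placeholders; the extractor is assumed to be any configured FulltextExtractor that returns a (text, extraction_data) tuple or None, as used here):

    from pathlib import Path

    from src.utils.pdf_utils.process_pdf import PDFDownloader

    downloader = PDFDownloader()
    pdf_path = downloader.download(
        "https://aclanthology.org/2024.acl-long.1.pdf",  # placeholder URL
        opath=Path("data/pdfs/2024.acl-long.1.pdf"),
    )
    extraction_output = pdf_fulltext_extractor(pdf_path)  # some configured FulltextExtractor instance
    if extraction_output is not None:
        text, extraction_data = extraction_output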
234
  def wrapped_add_annotated_pie_documents_from_dataset(
235
  retriever: DocumentAwareSpanRetriever, verbose: bool, layer_captions: dict[str, str], **kwargs
236
  ) -> pd.DataFrame:
 
284
  document_id: str,
285
  render_with: str,
286
  render_kwargs_json: str,
287
+ highlight_span_ids: Optional[List[str]] = None,
288
  ) -> str:
289
  text, spans, span_id2idx, relations = get_text_spans_and_relations_from_document(
290
  retriever=retriever, document_id=document_id
 
305
  spans=spans,
306
  span_id2idx=span_id2idx,
307
  binary_relations=relations,
308
+ highlight_span_ids=highlight_span_ids,
309
  **render_kwargs,
310
  )
311
  else:
src/demo/frontend_utils.py CHANGED
@@ -24,6 +24,18 @@ def close_accordion():
24
  return gr.Accordion(open=False)
25
 
26
 
 
 
27
  def change_tab(id: Union[int, str]):
28
  return gr.Tabs(selected=id)
29
 
 
24
  return gr.Accordion(open=False)
25
 
26
 
27
+ def open_accordion_with_stats(
28
+ overview: pd.DataFrame, base_label: str, caption2column: dict[str, str], total_column: str
29
+ ):
30
+ caption2value = {
31
+ caption: len(overview) if column == total_column else overview[column].sum()
32
+ for caption, column in caption2column.items()
33
+ }
34
+ stats_str = ", ".join([f"{value} {caption}" for caption, value in caption2value.items()])
35
+ label = f"{base_label} ({stats_str})"
36
+ return gr.Accordion(open=True, label=label)
37
+
38
+
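A small usage sketch for open_accordion_with_stats (hypothetical overview columns):

    import pandas as pd

    overview = pd.DataFrame({"doc_id": ["A32", "A33"], "num_adus": [12, 9], "num_relations": [7, 5]})
    open_accordion_with_stats(
        overview,
        base_label="Processed documents",
        caption2column={"documents": "doc_id", "ADUs": "num_adus", "relations": "num_relations"},
        total_column="doc_id",
    )
    # -> accordion labeled "Processed documents (2 documents, 21 ADUs, 12 relations)"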
39
  def change_tab(id: Union[int, str]):
40
  return gr.Tabs(selected=id)
41
 
src/demo/rendering_utils.py CHANGED
@@ -15,7 +15,7 @@ AVAILABLE_RENDER_MODES = [RENDER_WITH_DISPLACY, RENDER_WITH_PRETTY_TABLE]
15
 
16
  # adjusted from rendering_utils_displacy.TPL_ENT
17
  TPL_ENT_WITH_ID = """
18
- <mark class="entity" data-entity-id="{entity_id}" data-slice-idx="{slice_idx}" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
19
  {text}
20
  <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span>
21
  </mark>
@@ -31,8 +31,12 @@ HIGHLIGHT_SPANS_JS = """
31
  color = colors[colorDictKey];
32
  } catch (e) {}
33
  if (color) {
 
 
34
  entity.style.backgroundColor = color;
35
  entity.style.color = '#000';
 
 
36
  }
37
  }
38
 
@@ -42,6 +46,8 @@ HIGHLIGHT_SPANS_JS = """
42
  entities.forEach(entity => {
43
  const color = entity.getAttribute('data-color-original');
44
  entity.style.backgroundColor = color;
 
 
45
  entity.style.color = '';
46
  });
47
 
@@ -171,6 +177,7 @@ def render_displacy(
171
  spans: Union[Sequence[LabeledSpan], Sequence[LabeledMultiSpan]],
172
  span_id2idx: Dict[str, int],
173
  binary_relations: Sequence[BinaryRelation],
 
174
  inject_relations=True,
175
  colors_hover=None,
176
  entity_options={},
@@ -180,6 +187,9 @@ def render_displacy(
180
  ents: List[Dict[str, Any]] = []
181
  for entity_id, idx in span_id2idx.items():
182
  labeled_span = spans[idx]
 
 
 
183
  # pass the ID as a parameter to the entity. The id is required to fetch the entity annotations
184
  # on hover and to inject the relation data.
185
  if isinstance(labeled_span, LabeledSpan):
@@ -188,7 +198,11 @@ def render_displacy(
188
  "start": labeled_span.start,
189
  "end": labeled_span.end,
190
  "label": labeled_span.label,
191
- "params": {"entity_id": entity_id, "slice_idx": 0},
 
 
 
 
192
  }
193
  )
194
  elif isinstance(labeled_span, LabeledMultiSpan):
@@ -198,7 +212,11 @@ def render_displacy(
198
  "start": start,
199
  "end": end,
200
  "label": labeled_span.label,
201
- "params": {"entity_id": entity_id, "slice_idx": i},
 
 
 
 
202
  }
203
  )
204
  else:
@@ -254,7 +272,9 @@ def inject_relation_data(
254
  entities = soup.find_all(class_="entity")
255
  for entity in entities:
256
  original_color = entity["style"].split("background:")[1].split(";")[0].strip()
 
257
  entity["data-color-original"] = original_color
 
258
  if additional_colors is not None:
259
  for key, color in additional_colors.items():
260
  entity[f"data-color-{key}"] = (
 
15
 
16
  # adjusted from rendering_utils_displacy.TPL_ENT
17
  TPL_ENT_WITH_ID = """
18
+ <mark class="entity" data-entity-id="{entity_id}" data-slice-idx="{slice_idx}" data-highlight-mode="{highlight_mode}" style="background: {bg}; border-width: {border_width}; border-color: {border_color}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
19
  {text}
20
  <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span>
21
  </mark>
 
31
  color = colors[colorDictKey];
32
  } catch (e) {}
33
  if (color) {
34
+ //const highlightMode = entity.getAttribute('data-highlight-mode');
35
+ //if (highlightMode === 'fill') {
36
  entity.style.backgroundColor = color;
37
  entity.style.color = '#000';
38
+ //}
39
+ entity.style.borderColor = color;
40
  }
41
  }
42
 
 
46
  entities.forEach(entity => {
47
  const color = entity.getAttribute('data-color-original');
48
  entity.style.backgroundColor = color;
49
+ const borderColor = entity.getAttribute('data-border-color-original');
50
+ entity.style.borderColor = borderColor;
51
  entity.style.color = '';
52
  });
53
 
 
177
  spans: Union[Sequence[LabeledSpan], Sequence[LabeledMultiSpan]],
178
  span_id2idx: Dict[str, int],
179
  binary_relations: Sequence[BinaryRelation],
180
+ highlight_span_ids: Optional[List[str]] = None,
181
  inject_relations=True,
182
  colors_hover=None,
183
  entity_options={},
 
187
  ents: List[Dict[str, Any]] = []
188
  for entity_id, idx in span_id2idx.items():
189
  labeled_span = spans[idx]
190
+ highlight_mode = (
191
+ "fill" if highlight_span_ids is None or entity_id in highlight_span_ids else "border"
192
+ )
193
  # pass the ID as a parameter to the entity. The id is required to fetch the entity annotations
194
  # on hover and to inject the relation data.
195
  if isinstance(labeled_span, LabeledSpan):
 
198
  "start": labeled_span.start,
199
  "end": labeled_span.end,
200
  "label": labeled_span.label,
201
+ "params": {
202
+ "entity_id": entity_id,
203
+ "slice_idx": 0,
204
+ "highlight_mode": highlight_mode,
205
+ },
206
  }
207
  )
208
  elif isinstance(labeled_span, LabeledMultiSpan):
 
212
  "start": start,
213
  "end": end,
214
  "label": labeled_span.label,
215
+ "params": {
216
+ "entity_id": entity_id,
217
+ "slice_idx": i,
218
+ "highlight_mode": highlight_mode,
219
+ },
220
  }
221
  )
222
  else:
 
272
  entities = soup.find_all(class_="entity")
273
  for entity in entities:
274
  original_color = entity["style"].split("background:")[1].split(";")[0].strip()
275
+ original_border_color = entity["style"].split("border-color:")[1].split(";")[0].strip()
276
  entity["data-color-original"] = original_color
277
+ entity["data-border-color-original"] = original_border_color
278
  if additional_colors is not None:
279
  for key, color in additional_colors.items():
280
  entity[f"data-color-{key}"] = (
src/demo/rendering_utils_displacy.py CHANGED
@@ -200,7 +200,18 @@ class EntityRenderer(object):
200
  markup += "<br/>"
201
  if self.ents is None or label.upper() in self.ents:
202
  color = self.colors.get(label.upper(), self.default_color)
203
- ent_settings = {"label": label, "text": entity, "bg": color}
 
 
 
 
 
 
 
 
 
 
 
204
  ent_settings.update(additional_params)
205
  markup += self.ent_template.format(**ent_settings)
206
  else:
 
200
  markup += "<br/>"
201
  if self.ents is None or label.upper() in self.ents:
202
  color = self.colors.get(label.upper(), self.default_color)
203
+ ent_settings = {"label": label, "text": entity}
204
+ highlight_mode = additional_params.get("highlight_mode", "fill")
205
+ if highlight_mode == "fill":
206
+ ent_settings["bg"] = color
207
+ ent_settings["border_width"] = "0px"
208
+ ent_settings["border_color"] = color
209
+ elif highlight_mode == "border":
210
+ ent_settings["bg"] = "inherit"
211
+ ent_settings["border_width"] = "2px"
212
+ ent_settings["border_color"] = color
213
+ else:
214
+ raise ValueError(f"Invalid highlight_mode: {highlight_mode}")
215
  ent_settings.update(additional_params)
216
  markup += self.ent_template.format(**ent_settings)
217
  else:
src/demo/retrieve_and_dump_all_relevant.py CHANGED
@@ -9,6 +9,9 @@ root = pyrootutils.setup_root(
9
 
10
  import argparse
11
  import logging
 
 
 
12
 
13
  from src.demo.retriever_utils import (
14
  retrieve_all_relevant_spans,
@@ -55,6 +58,29 @@ if __name__ == "__main__":
55
  default=None,
56
  help="If provided, retrieve all spans for only this query span.",
57
  )
 
 
58
  args = parser.parse_args()
59
 
60
  logging.basicConfig(
@@ -74,9 +100,41 @@ if __name__ == "__main__":
74
  retriever.load_from_disc(args.data_path)
75
 
76
  search_kwargs = {"k": args.top_k, "score_threshold": args.threshold}
 
 
 
 
77
  logger.info(f"use search_kwargs: {search_kwargs}")
78
 
79
- if args.query_span_id is not None:
 
 
 
80
  logger.warning(f"retrieving results for single span: {args.query_span_id}")
81
  all_spans_for_all_documents = retrieve_relevant_spans(
82
  retriever=retriever, query_span_id=args.query_span_id, **search_kwargs
@@ -95,7 +153,8 @@ if __name__ == "__main__":
95
  logger.warning("no relevant spans found in any document")
96
  exit(0)
97
 
98
- logger.info(f"dumping results to {args.output_path}...")
 
99
  all_spans_for_all_documents.to_json(args.output_path)
100
 
101
  logger.info("done")
 
9
 
10
  import argparse
11
  import logging
12
+ import os
13
+
14
+ import pandas as pd
15
 
16
  from src.demo.retriever_utils import (
17
  retrieve_all_relevant_spans,
 
58
  default=None,
59
  help="If provided, retrieve all spans for only this query span.",
60
  )
61
+ parser.add_argument(
62
+ "--doc_id_whitelist",
63
+ type=str,
64
+ nargs="+",
65
+ default=None,
66
+ help="If provided, only consider documents with these IDs.",
67
+ )
68
+ parser.add_argument(
69
+ "--doc_id_blacklist",
70
+ type=str,
71
+ nargs="+",
72
+ default=None,
73
+ help="If provided, ignore documents with these IDs.",
74
+ )
75
+ parser.add_argument(
76
+ "--query_target_doc_id_pairs",
77
+ type=str,
78
+ nargs="+",
79
+ default=None,
80
+ help="One or more pairs of query and target document IDs "
81
+ '(each separated by ":") to retrieve spans for. If provided, '
82
+ "--query_doc_id and --query_span_id are ignored.",
83
+ )
84
  args = parser.parse_args()
85
 
86
  logging.basicConfig(
 
100
  retriever.load_from_disc(args.data_path)
101
 
102
  search_kwargs = {"k": args.top_k, "score_threshold": args.threshold}
103
+ if args.doc_id_whitelist is not None:
104
+ search_kwargs["doc_id_whitelist"] = args.doc_id_whitelist
105
+ if args.doc_id_blacklist is not None:
106
+ search_kwargs["doc_id_blacklist"] = args.doc_id_blacklist
107
  logger.info(f"use search_kwargs: {search_kwargs}")
108
 
109
+ if args.query_target_doc_id_pairs is not None:
110
+ all_spans_for_all_documents = None
111
+ for doc_id_pair in args.query_target_doc_id_pairs:
112
+ query_doc_id, target_doc_id = doc_id_pair.split(":")
113
+ current_result = retrieve_all_relevant_spans(
114
+ retriever=retriever,
115
+ query_doc_id=query_doc_id,
116
+ doc_id_whitelist=[target_doc_id],
117
+ **search_kwargs,
118
+ )
119
+ if current_result is None:
120
+ logger.warning(
121
+ f"no relevant spans found for query_doc_id={query_doc_id} and "
122
+ f"target_doc_id={target_doc_id}"
123
+ )
124
+ continue
125
+ logger.info(
126
+ f"retrieved {len(current_result)} spans for query_doc_id={query_doc_id} "
127
+ f"and target_doc_id={target_doc_id}"
128
+ )
129
+ current_result["query_doc_id"] = query_doc_id
130
+ if all_spans_for_all_documents is None:
131
+ all_spans_for_all_documents = current_result
132
+ else:
133
+ all_spans_for_all_documents = pd.concat(
134
+ [all_spans_for_all_documents, current_result], ignore_index=True
135
+ )
136
+
137
+ elif args.query_span_id is not None:
138
  logger.warning(f"retrieving results for single span: {args.query_span_id}")
139
  all_spans_for_all_documents = retrieve_relevant_spans(
140
  retriever=retriever, query_span_id=args.query_span_id, **search_kwargs
 
153
  logger.warning("no relevant spans found in any document")
154
  exit(0)
155
 
156
+ logger.info(f"dumping results ({len(all_spans_for_all_documents)}) to {args.output_path}...")
157
+ os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
158
  all_spans_for_all_documents.to_json(args.output_path)
159
 
160
  logger.info("done")
src/demo/retriever_utils.py CHANGED
@@ -8,10 +8,8 @@ from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
8
  from typing_extensions import Protocol
9
 
10
  from src.langchain_modules import DocumentAwareSpanRetriever
11
- from src.langchain_modules.span_retriever import (
12
- DocumentAwareSpanRetrieverWithRelations,
13
- _parse_config,
14
- )
15
 
16
  logger = logging.getLogger(__name__)
17
 
@@ -22,13 +20,13 @@ def get_document_as_dict(retriever: DocumentAwareSpanRetriever, doc_id: str) ->
22
 
23
 
24
  def load_retriever(
25
- retriever_config_str: str,
26
  config_format: str,
27
  device: str = "cpu",
28
  previous_retriever: Optional[DocumentAwareSpanRetrieverWithRelations] = None,
29
  ) -> DocumentAwareSpanRetrieverWithRelations:
30
  try:
31
- retriever_config = _parse_config(retriever_config_str, format=config_format)
32
  # set device for the embeddings pipeline
33
  retriever_config["vectorstore"]["embedding"]["pipeline_kwargs"]["device"] = device
34
  result = DocumentAwareSpanRetrieverWithRelations.instantiate_from_config(retriever_config)
@@ -153,6 +151,7 @@ def _retrieve_for_all_spans(
153
  query_doc_id: str,
154
  retrieve_func: RetrieverCallable,
155
  query_span_id_column: str = "query_span_id",
 
156
  **kwargs,
157
  ) -> Optional[pd.DataFrame]:
158
  if not query_doc_id.strip():
@@ -177,6 +176,9 @@ def _retrieve_for_all_spans(
177
  # add column with query_span_id
178
  for query_span_id, query_span_result in span_results_not_empty.items():
179
  query_span_result[query_span_id_column] = query_span_id
 
 
 
180
 
181
  if len(span_results_not_empty) == 0:
182
  gr.Info(f"No results found for any ADU in document {query_doc_id}.")
 
8
  from typing_extensions import Protocol
9
 
10
  from src.langchain_modules import DocumentAwareSpanRetriever
11
+ from src.langchain_modules.span_retriever import DocumentAwareSpanRetrieverWithRelations
12
+ from src.utils import parse_config
 
 
13
 
14
  logger = logging.getLogger(__name__)
15
 
 
20
 
21
 
22
  def load_retriever(
23
+ config_str: str,
24
  config_format: str,
25
  device: str = "cpu",
26
  previous_retriever: Optional[DocumentAwareSpanRetrieverWithRelations] = None,
27
  ) -> DocumentAwareSpanRetrieverWithRelations:
28
  try:
29
+ retriever_config = parse_config(config_str, format=config_format)
30
  # set device for the embeddings pipeline
31
  retriever_config["vectorstore"]["embedding"]["pipeline_kwargs"]["device"] = device
32
  result = DocumentAwareSpanRetrieverWithRelations.instantiate_from_config(retriever_config)
 
151
  query_doc_id: str,
152
  retrieve_func: RetrieverCallable,
153
  query_span_id_column: str = "query_span_id",
154
+ query_span_text_column: Optional[str] = None,
155
  **kwargs,
156
  ) -> Optional[pd.DataFrame]:
157
  if not query_doc_id.strip():
 
176
  # add column with query_span_id
177
  for query_span_id, query_span_result in span_results_not_empty.items():
178
  query_span_result[query_span_id_column] = query_span_id
179
+ if query_span_text_column is not None:
180
+ query_span = retriever.get_span_by_id(span_id=query_span_id)
181
+ query_span_result[query_span_text_column] = str(query_span)
182
 
183
  if len(span_results_not_empty) == 0:
184
  gr.Info(f"No results found for any ADU in document {query_doc_id}.")
src/document/processing.py CHANGED
@@ -1,13 +1,17 @@
1
  from __future__ import annotations
2
 
3
  import logging
4
- from typing import Any, Dict, Iterable, List, Sequence, Set, Tuple, TypeVar, Union
5
 
6
- import networkx as nx
7
- from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
8
  from pie_modules.documents import TextDocumentWithLabeledMultiSpansAndBinaryRelations
 
9
  from pytorch_ie import AnnotationLayer
10
  from pytorch_ie.core import Document
 
 
 
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
@@ -64,76 +68,7 @@ def remove_overlapping_entities(
64
  return new_doc
65
 
66
 
67
- def _merge_spans_via_relation(
68
- spans: Sequence[LabeledSpan],
69
- relations: Sequence[BinaryRelation],
70
- link_relation_label: str,
71
- create_multi_spans: bool = True,
72
- ) -> Tuple[Union[Set[LabeledSpan], Set[LabeledMultiSpan]], Set[BinaryRelation]]:
73
- # convert list of relations to a graph to easily calculate connected components to merge
74
- g = nx.Graph()
75
- link_relations = []
76
- other_relations = []
77
- for rel in relations:
78
- if rel.label == link_relation_label:
79
- link_relations.append(rel)
80
- # never merge spans that have not the same label
81
- if (
82
- not (isinstance(rel.head, LabeledSpan) or isinstance(rel.tail, LabeledSpan))
83
- or rel.head.label == rel.tail.label
84
- ):
85
- g.add_edge(rel.head, rel.tail)
86
- else:
87
- logger.debug(
88
- f"spans to merge do not have the same label, do not merge them: {rel.head}, {rel.tail}"
89
- )
90
- else:
91
- other_relations.append(rel)
92
-
93
- span_mapping = {}
94
- connected_components: Set[LabeledSpan]
95
- for connected_components in nx.connected_components(g):
96
- # all spans in a connected component have the same label
97
- label = list(span.label for span in connected_components)[0]
98
- connected_components_sorted = sorted(connected_components, key=lambda span: span.start)
99
- if create_multi_spans:
100
- new_span = LabeledMultiSpan(
101
- slices=tuple((span.start, span.end) for span in connected_components_sorted),
102
- label=label,
103
- )
104
- else:
105
- new_span = LabeledSpan(
106
- start=min(span.start for span in connected_components_sorted),
107
- end=max(span.end for span in connected_components_sorted),
108
- label=label,
109
- )
110
- for span in connected_components_sorted:
111
- span_mapping[span] = new_span
112
- for span in spans:
113
- if span not in span_mapping:
114
- if create_multi_spans:
115
- span_mapping[span] = LabeledMultiSpan(
116
- slices=((span.start, span.end),), label=span.label, score=span.score
117
- )
118
- else:
119
- span_mapping[span] = LabeledSpan(
120
- start=span.start, end=span.end, label=span.label, score=span.score
121
- )
122
-
123
- new_spans = set(span_mapping.values())
124
- new_relations = set(
125
- BinaryRelation(
126
- head=span_mapping[rel.head],
127
- tail=span_mapping[rel.tail],
128
- label=rel.label,
129
- score=rel.score,
130
- )
131
- for rel in other_relations
132
- )
133
-
134
- return new_spans, new_relations
135
-
136
-
137
  def merge_spans_via_relation(
138
  document: D,
139
  relation_layer: str,
@@ -186,15 +121,50 @@ def merge_spans_via_relation(
186
 
187
 
188
  def remove_partitions_by_labels(
189
- document: D, partition_layer: str, label_blacklist: List[str]
190
  ) -> D:
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  document = document.copy()
192
- layer: AnnotationLayer = document[partition_layer]
193
  new_partitions = []
194
- for partition in layer.clear():
195
  if partition.label not in label_blacklist:
196
  new_partitions.append(partition)
197
- layer.extend(new_partitions)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  return document
199
 
200
 
@@ -221,3 +191,168 @@ def replace_substrings_in_text(
221
  def replace_substrings_in_text_with_spaces(document: D_text, substrings: Iterable[str]) -> D_text:
222
  replacements = {substring: " " * len(substring) for substring in substrings}
223
  return replace_substrings_in_text(document, replacements=replacements)
1
  from __future__ import annotations
2
 
3
  import logging
4
+ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, TypeVar
5
 
6
+ from pie_modules.document.processing.merge_spans_via_relation import _merge_spans_via_relation
 
7
  from pie_modules.documents import TextDocumentWithLabeledMultiSpansAndBinaryRelations
8
+ from pie_modules.utils.span import have_overlap
9
  from pytorch_ie import AnnotationLayer
10
  from pytorch_ie.core import Document
11
+ from pytorch_ie.core.document import Annotation, _enumerate_dependencies
12
+
13
+ from src.utils import distance
14
+ from src.utils.span_utils import get_overlap_len
15
 
16
  logger = logging.getLogger(__name__)
17
 
 
68
  return new_doc
69
 
70
 
71
+ # TODO: remove and use pie_modules.document.processing.SpansViaRelationMerger instead
72
  def merge_spans_via_relation(
73
  document: D,
74
  relation_layer: str,
 
121
 
122
 
123
  def remove_partitions_by_labels(
124
+ document: D, partition_layer: str, label_blacklist: List[str], span_layer: Optional[str] = None
125
  ) -> D:
126
+ """Remove partitions with labels in the blacklist from a document.
127
+
128
+ Args:
129
+ document: The document to process.
130
+ partition_layer: The name of the partition layer.
131
+ label_blacklist: The list of labels to remove.
132
+ span_layer: The name of the span layer to remove spans from if they are not fully
133
+ contained in any remaining partition. Any dependent annotations will be removed as well.
134
+
135
+ Returns:
136
+ The processed document.
137
+ """
138
+
139
  document = document.copy()
140
+ p_layer: AnnotationLayer = document[partition_layer]
141
  new_partitions = []
142
+ for partition in p_layer.clear():
143
  if partition.label not in label_blacklist:
144
  new_partitions.append(partition)
145
+ p_layer.extend(new_partitions)
146
+
147
+ if span_layer is not None:
148
+ result = document.copy(with_annotations=False)
149
+ removed_span_ids = set()
150
+ for span in document[span_layer]:
151
+ # keep spans fully contained in any partition
152
+ if any(
153
+ partition.start <= span.start and span.end <= partition.end
154
+ for partition in new_partitions
155
+ ):
156
+ result[span_layer].append(span.copy())
157
+ else:
158
+ removed_span_ids.add(span._id)
159
+
160
+ result.add_all_annotations_from_other(
161
+ document,
162
+ removed_annotations={span_layer: removed_span_ids},
163
+ strict=False,
164
+ verbose=False,
165
+ )
166
+ document = result
167
+
168
  return document
169
 
170
 
 
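A usage sketch for the extended remove_partitions_by_labels above; the document and the layer/label names are illustrative only.

from src.document.processing import remove_partitions_by_labels

# drop e.g. abstract partitions and discard any span that no longer falls inside
# a kept partition (dependent annotations such as relations are dropped as well)
cleaned = remove_partitions_by_labels(
    doc,                                   # a PIE document with a partition and a span layer (placeholder)
    partition_layer="labeled_partitions",  # hypothetical layer name
    label_blacklist=["abstract"],          # hypothetical label
    span_layer="labeled_spans",            # new optional argument
)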
191
  def replace_substrings_in_text_with_spaces(document: D_text, substrings: Iterable[str]) -> D_text:
192
  replacements = {substring: " " * len(substring) for substring in substrings}
193
  return replace_substrings_in_text(document, replacements=replacements)
194
+
195
+
196
+ def relabel_annotations(
197
+ document: D,
198
+ label_mapping: Dict[str, Dict[str, str]],
199
+ ) -> D:
200
+ """
201
+ Replace annotation labels in a document.
202
+
203
+ Args:
204
+ document: The document to process.
205
+ label_mapping: A mapping from layer names to mappings from old labels to new labels.
206
+
207
+ Returns:
208
+ The processed document.
209
+
210
+ """
211
+
212
+ dependency_ordered_fields: List[str] = []
213
+ _enumerate_dependencies(
214
+ dependency_ordered_fields,
215
+ dependency_graph=document._annotation_graph,
216
+ nodes=document._annotation_graph["_artificial_root"],
217
+ )
218
+ result = document.copy(with_annotations=False)
219
+ store: Dict[int, Annotation] = {}
220
+ # not yet used
221
+ invalid_annotation_ids: Set[int] = set()
222
+ for field_name in dependency_ordered_fields:
223
+ if field_name in document._annotation_fields:
224
+ layer = document[field_name]
225
+ for is_prediction, anns in [(False, layer), (True, layer.predictions)]:
226
+ for ann in anns:
227
+ new_ann = ann.copy_with_store(
228
+ override_annotation_store=store,
229
+ invalid_annotation_ids=invalid_annotation_ids,
230
+ )
231
+ if field_name in label_mapping:
232
+ if ann.label in label_mapping[field_name]:
233
+ new_label = label_mapping[field_name][ann.label]
234
+ new_ann = new_ann.copy(label=new_label)
235
+ else:
236
+ raise ValueError(
237
+ f"Label {ann.label} not found in label mapping for {field_name}"
238
+ )
239
+ store[ann._id] = new_ann
240
+ target_layer = result[field_name]
241
+ if is_prediction:
242
+ target_layer.predictions.append(new_ann)
243
+ else:
244
+ target_layer.append(new_ann)
245
+
246
+ return result
247
+
248
+
249
+ DWithSpans = TypeVar("DWithSpans", bound=Document)
250
+
251
+
252
+ def align_predicted_span_annotations(
253
+ document: DWithSpans, span_layer: str, distance_type: str = "center", verbose: bool = False
254
+ ) -> DWithSpans:
255
+ """
256
+ Aligns predicted span annotations with the closest gold spans in a document.
257
+
258
+ First, calculates the distance between each predicted span and each gold span. Then,
259
+ for each predicted span, the gold span with the smallest distance is selected. If the
260
+ predicted span and the gold span have an overlap of at least half of the maximum length
261
+ of the two spans, the predicted span is aligned with the gold span.
262
+
263
+ Args:
264
+ document: The document to process.
265
+ span_layer: The name of the span layer.
266
+ distance_type: The type of distance to calculate. One of: center, inner, outer
267
+ verbose: Whether to print debug information.
268
+
269
+ Returns:
270
+ The processed document.
271
+ """
272
+ gold_spans = document[span_layer]
273
+ if len(gold_spans) == 0:
274
+ return document.copy()
275
+
276
+ pred_spans = document[span_layer].predictions
277
+ old2new_pred_span = {}
278
+ span_id2gold_span = {}
279
+ for pred_span in pred_spans:
280
+
281
+ gold_spans_with_distance = [
282
+ (
283
+ gold_span,
284
+ distance(
285
+ start_end=(pred_span.start, pred_span.end),
286
+ other_start_end=(gold_span.start, gold_span.end),
287
+ distance_type=distance_type,
288
+ ),
289
+ )
290
+ for gold_span in gold_spans
291
+ ]
292
+
293
+ closest_gold_span, min_distance = min(gold_spans_with_distance, key=lambda x: x[1])
294
+ # if the closest gold span is the same as the predicted span, we don't need to align
295
+ if min_distance == 0.0:
296
+ continue
297
+
298
+ if have_overlap(
299
+ start_end=(pred_span.start, pred_span.end),
300
+ other_start_end=(closest_gold_span.start, closest_gold_span.end),
301
+ ):
302
+ overlap_len = get_overlap_len(
303
+ (pred_span.start, pred_span.end), (closest_gold_span.start, closest_gold_span.end)
304
+ )
305
+ # get the maximum length of the two spans
306
+ l_max = max(
307
+ pred_span.end - pred_span.start, closest_gold_span.end - closest_gold_span.start
308
+ )
309
+ # if the overlap is at least half of the maximum length, we consider it a valid match for alignment
310
+ valid_match = overlap_len >= (l_max / 2)
311
+ else:
312
+ valid_match = False
313
+
314
+ if valid_match:
315
+ aligned_pred_span = pred_span.copy(
316
+ start=closest_gold_span.start, end=closest_gold_span.end
317
+ )
318
+ old2new_pred_span[pred_span._id] = aligned_pred_span
319
+ span_id2gold_span[pred_span._id] = closest_gold_span
320
+
321
+ result = document.copy(with_annotations=False)
322
+
323
+ # multiple predicted spans can be aligned with the same gold span,
324
+ # so we need to keep track of the added spans
325
+ added_pred_span_ids = dict()
326
+ for pred_span in pred_spans:
327
+ # just add the predicted span if it was not aligned with a gold span
328
+ if pred_span._id not in old2new_pred_span:
329
+ # if this was not added before (e.g. as aligned span), add it
330
+ if pred_span._id not in added_pred_span_ids:
331
+ keep_pred_span = pred_span.copy()
332
+ result[span_layer].predictions.append(keep_pred_span)
333
+ added_pred_span_ids[pred_span._id] = keep_pred_span
334
+ elif verbose:
335
+ print(f"Skipping duplicate predicted span. pred_span='{str(pred_span)}'")
336
+ else:
337
+ aligned_pred_span = old2new_pred_span[pred_span._id]
338
+ # if this was not added before (e.g. as aligned or original pred span), add it
339
+ if aligned_pred_span._id not in added_pred_span_ids:
340
+ result[span_layer].predictions.append(aligned_pred_span)
341
+ added_pred_span_ids[aligned_pred_span._id] = aligned_pred_span
342
+ elif verbose:
343
+ prev_pred_span = added_pred_span_ids[aligned_pred_span._id]
344
+ gold_span = span_id2gold_span[pred_span._id]
345
+ print(
346
+ f"Skipping duplicate aligned predicted span. aligned gold_span='{str(gold_span)}', "
347
+ f"prev_pred_span='{str(prev_pred_span)}', current_pred_span='{str(pred_span)}'"
348
+ )
349
+ # print("bbb")
350
+
351
+ result[span_layer].extend([span.copy() for span in gold_spans])
352
+
353
+ # add remaining gold and predicted spans (the result, _aligned_spans, is just for debugging)
354
+ _aligned_spans = result.add_all_annotations_from_other(
355
+ document, override_annotations={span_layer: old2new_pred_span}
356
+ )
357
+
358
+ return result
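A small usage sketch for the new relabel_annotations helper above; the document and the layer/label names are invented for illustration. Note that the mapping must cover every label that occurs in a mapped layer, otherwise a ValueError is raised.

from src.document.processing import relabel_annotations

relabeled = relabel_annotations(
    doc,  # a PIE document with a "labeled_spans" layer (placeholder)
    label_mapping={"labeled_spans": {"background_claim": "claim", "own_claim": "claim", "data": "data"}},
)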
src/hydra_callbacks/save_job_return_value.py CHANGED
@@ -3,7 +3,7 @@ import logging
3
  import os
4
  import pickle
5
  from pathlib import Path
6
- from typing import Any, Dict, Generator, List, Tuple, Union
7
 
8
  import numpy as np
9
  import pandas as pd
@@ -33,38 +33,48 @@ def to_py_obj(obj):
33
  def list_of_dicts_to_dict_of_lists_recursive(list_of_dicts):
34
  """Convert a list of dicts to a dict of lists recursively.
35
 
36
- Example:
37
  # works with nested dicts
38
  >>> list_of_dicts_to_dict_of_lists_recursive([{"a": 1, "b": {"c": 2}}, {"a": 3, "b": {"c": 4}}])
39
- {'b': {'c': [2, 4]}, 'a': [1, 3]}
40
  # works with incomplete dicts
41
  >>> list_of_dicts_to_dict_of_lists_recursive([{"a": 1, "b": 2}, {"a": 3}])
42
- {'b': [2, None], 'a': [1, 3]}
 
 
 
 
 
 
 
 
43
 
44
  Args:
45
  list_of_dicts (List[dict]): A list of dicts.
46
 
47
  Returns:
48
- dict: A dict of lists.
49
  """
50
- if isinstance(list_of_dicts, list):
51
- if len(list_of_dicts) == 0:
52
- return {}
53
- elif isinstance(list_of_dicts[0], dict):
54
- keys = set()
55
- for d in list_of_dicts:
56
- if not isinstance(d, dict):
57
- raise ValueError("Not all elements of the list are dicts.")
 
58
  keys.update(d.keys())
59
- return {
60
- k: list_of_dicts_to_dict_of_lists_recursive(
61
- [d.get(k, None) for d in list_of_dicts]
62
- )
63
- for k in keys
64
- }
65
- else:
66
- return list_of_dicts
67
  else:
 
68
  return list_of_dicts
69
 
70
 
@@ -77,20 +87,50 @@ def _flatten_dict_gen(d, parent_key: Tuple[str, ...] = ()) -> Generator:
77
  yield new_key, v
78
 
79
 
80
- def flatten_dict(d: Dict[str, Any]) -> Dict[Tuple[str, ...], Any]:
81
- return dict(_flatten_dict_gen(d))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84
- def unflatten_dict(d: Dict[Tuple[str, ...], Any]) -> Union[Dict[str, Any], Any]:
85
- """Unflattens a dictionary with nested keys.
 
 
 
86
 
87
  Example:
88
  >>> d = {("a", "b", "c"): 1, ("a", "b", "d"): 2, ("a", "e"): 3}
89
  >>> unflatten_dict(d)
90
  {'a': {'b': {'c': 1, 'd': 2}, 'e': 3}}
 
 
 
 
 
91
  """
92
  result: Dict[str, Any] = {}
93
  for k, v in d.items():
 
 
94
  if len(k) == 0:
95
  if len(result) > 1:
96
  raise ValueError("Cannot unflatten dictionary with multiple root keys.")
@@ -152,30 +192,82 @@ class SaveJobReturnValueCallback(Callback):
152
  nested), where the keys are the keys of the job return-values and the values are lists of the corresponding
153
  values of all jobs. This is useful if you want to access specific values of all jobs in a multi-run all at once.
154
  Also, aggregated values (e.g. mean, min, max) are created for all numeric values and saved in another file.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  """
156
 
157
  def __init__(
158
  self,
159
  filenames: Union[str, List[str]] = "job_return_value.json",
160
  integrate_multirun_result: bool = False,
 
 
 
 
 
 
 
 
161
  ) -> None:
162
  self.log = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
163
  self.filenames = [filenames] if isinstance(filenames, str) else filenames
164
  self.integrate_multirun_result = integrate_multirun_result
165
  self.job_returns: List[JobReturn] = []
 
 
 
 
 
 
 
 
166
 
167
  def on_job_end(self, config: DictConfig, job_return: JobReturn, **kwargs: Any) -> None:
168
  self.job_returns.append(job_return)
169
  output_dir = Path(config.hydra.runtime.output_dir) # / Path(config.hydra.output_subdir)
 
 
 
 
 
170
  for filename in self.filenames:
171
  self._save(obj=job_return.return_value, filename=filename, output_dir=output_dir)
172
 
173
  def on_multirun_end(self, config: DictConfig, **kwargs: Any) -> None:
 
 
 
 
 
 
174
  if self.integrate_multirun_result:
175
  # rearrange the job return-values of all jobs from a multi-run into a dict of lists (maybe nested),
176
  obj = list_of_dicts_to_dict_of_lists_recursive(
177
  [jr.return_value for jr in self.job_returns]
178
  )
 
 
 
 
 
179
  # also create an aggregated result
180
  # convert to python object to allow selecting numeric columns
181
  obj_py = to_py_obj(obj)
@@ -195,6 +287,11 @@ class SaveJobReturnValueCallback(Callback):
195
  else:
196
  # aggregate the numeric values
197
  df_described = df_numbers_only.describe()
 
 
 
 
 
198
  # add the aggregation keys (e.g. mean, min, ...) as most inner keys and convert back to dict
199
  obj_flat_aggregated = df_described.T.stack().to_dict()
200
  # unflatten because _save() works better with nested dicts
@@ -202,25 +299,46 @@ class SaveJobReturnValueCallback(Callback):
202
  else:
203
  # create a dict of the job return-values of all jobs from a multi-run
204
  # (_save() works better with nested dicts)
205
- ids = overrides_to_identifiers([jr.overrides for jr in self.job_returns])
206
- obj = {identifier: jr.return_value for identifier, jr in zip(ids, self.job_returns)}
 
207
  obj_aggregated = None
208
  output_dir = Path(config.hydra.sweep.dir)
 
 
 
 
 
 
 
 
209
  for filename in self.filenames:
210
  self._save(
211
  obj=obj,
212
  filename=filename,
213
  output_dir=output_dir,
214
- multi_run_result=self.integrate_multirun_result,
215
  )
216
  # if available, also save the aggregated result
217
  if obj_aggregated is not None:
218
  file_base_name, ext = os.path.splitext(filename)
219
  filename_aggregated = f"{file_base_name}.aggregated{ext}"
220
- self._save(obj=obj_aggregated, filename=filename_aggregated, output_dir=output_dir)
 
 
 
 
 
 
 
221
 
222
  def _save(
223
- self, obj: Any, filename: str, output_dir: Path, multi_run_result: bool = False
 
 
 
 
 
224
  ) -> None:
225
  self.log.info(f"Saving job_return in {output_dir / filename}")
226
  output_dir.mkdir(parents=True, exist_ok=True)
@@ -236,23 +354,43 @@ class SaveJobReturnValueCallback(Callback):
236
  elif filename.endswith(".md"):
237
  # Convert PyTorch tensors and numpy arrays to native python types
238
  obj_py = to_py_obj(obj)
 
 
239
  obj_py_flat = flatten_dict(obj_py)
240
 
241
- if multi_run_result:
242
- # In the case of multi-run, we expect to have multiple values for each key.
243
- # We therefore just convert the dict to a pandas DataFrame.
244
  result = pd.DataFrame(obj_py_flat)
 
 
 
 
 
 
245
  else:
246
- # In the case of a single job, we expect to have only one value for each key.
247
- # We therefore convert the dict to a pandas Series and ...
248
  series = pd.Series(obj_py_flat)
249
- if len(series.index.levels) > 1:
250
- # ... if the Series has multiple index levels, we create a DataFrame by unstacking the last level.
251
- result = series.unstack(-1)
252
- else:
253
- # ... otherwise we just unpack the one-entry index values and save the resulting Series.
254
  series.index = series.index.get_level_values(0)
255
  result = series
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  with open(str(output_dir / filename), "w") as file:
258
  file.write(result.to_markdown())
 
3
  import os
4
  import pickle
5
  from pathlib import Path
6
+ from typing import Any, Dict, Generator, List, Optional, Tuple, Union
7
 
8
  import numpy as np
9
  import pandas as pd
 
33
  def list_of_dicts_to_dict_of_lists_recursive(list_of_dicts):
34
  """Convert a list of dicts to a dict of lists recursively.
35
 
36
+ Examples:
37
  # works with nested dicts
38
  >>> list_of_dicts_to_dict_of_lists_recursive([{"a": 1, "b": {"c": 2}}, {"a": 3, "b": {"c": 4}}])
39
+ {'a': [1, 3], 'b': {'c': [2, 4]}}
40
  # works with incomplete dicts
41
  >>> list_of_dicts_to_dict_of_lists_recursive([{"a": 1, "b": 2}, {"a": 3}])
42
+ {'a': [1, 3], 'b': [2, None]}
43
+
44
+ # works with nested incomplete dicts
45
+ >>> list_of_dicts_to_dict_of_lists_recursive([{"a": 1, "b": {"c": 2}}, {"a": 3}])
46
+ {'a': [1, 3], 'b': {'c': [2, None]}}
47
+
48
+ # works with nested incomplete dicts with None values
49
+ >>> list_of_dicts_to_dict_of_lists_recursive([{"a": 1, "b": {"c": 2}}, {"a": None}])
50
+ {'a': [1, None], 'b': {'c': [2, None]}}
51
 
52
  Args:
53
  list_of_dicts (List[dict]): A list of dicts.
54
 
55
  Returns:
56
+ dict: An arbitrarily nested dict of lists.
57
  """
58
+ if not list_of_dicts:
59
+ return {}
60
+
61
+ # Check if all elements are either None or dictionaries
62
+ if all(d is None or isinstance(d, dict) for d in list_of_dicts):
63
+ # Gather all keys from non-None dictionaries
64
+ keys = set()
65
+ for d in list_of_dicts:
66
+ if d is not None:
67
  keys.update(d.keys())
68
+
69
+ # Build up the result recursively
70
+ return {
71
+ k: list_of_dicts_to_dict_of_lists_recursive(
72
+ [(d[k] if d is not None and k in d else None) for d in list_of_dicts]
73
+ )
74
+ for k in keys
75
+ }
76
  else:
77
+ # If items are not all dict/None, just return the list as is (base case).
78
  return list_of_dicts
79
 
80
 
 
87
  yield new_key, v
88
 
89
 
90
+ def flatten_dict(d: Dict[str, Any], pad_keys: bool = True) -> Dict[Tuple[str, ...], Any]:
91
+ """Flattens a dictionary with nested keys. Per default, the keys are padded with np.nan to have
92
+ the same length.
93
+
94
+ Example:
95
+ >>> d = {'a': {'b': {'c': 1, 'd': 2}, 'e': 3}}
96
+ >>> flatten_dict(d)
97
+ {('a', 'b', 'c'): 1, ('a', 'b', 'd'): 2, ('a', 'e', np.nan): 3}
98
+
99
+ # without padding the keys
100
+ >>> d = {'a': {'b': {'c': 1, 'd': 2}, 'e': 3}}
101
+ >>> flatten_dict(d, pad_keys=False)
102
+ {('a', 'b', 'c'): 1, ('a', 'b', 'd'): 2, ('a', 'e'): 3}
103
+ """
104
+ result = dict(_flatten_dict_gen(d))
105
+ # pad the keys with np.nan to have the same length. We use np.nan to be pandas-friendly.
106
+ if pad_keys:
107
+ max_num_keys = max(len(k) for k in result.keys())
108
+ result = {
109
+ tuple(list(k) + [np.nan] * (max_num_keys - len(k))): v for k, v in result.items()
110
+ }
111
+ return result
112
 
113
 
114
+ def unflatten_dict(
115
+ d: Dict[Tuple[str, ...], Any], unpad_keys: bool = True
116
+ ) -> Union[Dict[str, Any], Any]:
117
+ """Unflattens a dictionary with nested keys. Per default, the keys are unpadded by removing
118
+ np.nan values.
119
 
120
  Example:
121
  >>> d = {("a", "b", "c"): 1, ("a", "b", "d"): 2, ("a", "e"): 3}
122
  >>> unflatten_dict(d)
123
  {'a': {'b': {'c': 1, 'd': 2}, 'e': 3}}
124
+
125
+ # with unpad the keys
126
+ >>> d = {("a", "b", "c"): 1, ("a", "b", "d"): 2, ("a", "e", np.nan): 3}
127
+ >>> unflatten_dict(d)
128
+ {'a': {'b': {'c': 1, 'd': 2}, 'e': 3}}
129
  """
130
  result: Dict[str, Any] = {}
131
  for k, v in d.items():
132
+ if unpad_keys:
133
+ k = tuple([ki for ki in k if not pd.isna(ki)])
134
  if len(k) == 0:
135
  if len(result) > 1:
136
  raise ValueError("Cannot unflatten dictionary with multiple root keys.")
 
192
  nested), where the keys are the keys of the job return-values and the values are lists of the corresponding
193
  values of all jobs. This is useful if you want to access specific values of all jobs in a multi-run all at once.
194
  Also, aggregated values (e.g. mean, min, max) are created for all numeric values and saved in another file.
195
+ multirun_aggregator_blacklist: List[str] (default: None)
196
+ A list of keys to exclude from the aggregation (of multirun results), such as "count" or "25%". If None,
197
+ all keys are included. See pd.DataFrame.describe() for possible aggregation keys.
198
+ For numeric values, it is recommended to use ["min", "25%", "50%", "75%", "max"]
199
+ which will result in keeping only the count, mean and std values.
200
+ multirun_create_ids_from_overrides: bool (default: True)
201
+ Create job identifiers from the overrides of the jobs in a multi-run. If False, the job index is used as
202
+ identifier.
203
+ markdown_round_digits: int (default: 3)
204
+ The number of digits to round the values in the markdown file. If None, no rounding is applied.
205
+ multirun_job_id_key: str (default: "job_id")
206
+ The key to use for the job identifiers in the integrated multi-run result.
207
+ paths_file: str (default: None)
208
+ The file to save the paths of the log directories to. If None, the paths are not saved.
209
+ path_id: str (default: None)
210
+ A prefix to add to each line in the paths_file separated by a colon. If None, no prefix is added.
211
+ multirun_paths_file: str (default: None)
212
+ The file to save the paths of the multi-run log directories to. If None, the paths are not saved.
213
+ multirun_path_id: str (default: None)
214
+ A prefix to add to each line in the multirun_paths_file separated by a colon. If None, no prefix is added.
215
  """
216
 
217
  def __init__(
218
  self,
219
  filenames: Union[str, List[str]] = "job_return_value.json",
220
  integrate_multirun_result: bool = False,
221
+ multirun_aggregator_blacklist: Optional[List[str]] = None,
222
+ multirun_create_ids_from_overrides: bool = True,
223
+ markdown_round_digits: Optional[int] = 3,
224
+ multirun_job_id_key: str = "job_id",
225
+ paths_file: Optional[str] = None,
226
+ path_id: Optional[str] = None,
227
+ multirun_paths_file: Optional[str] = None,
228
+ multirun_path_id: Optional[str] = None,
229
  ) -> None:
230
  self.log = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
231
  self.filenames = [filenames] if isinstance(filenames, str) else filenames
232
  self.integrate_multirun_result = integrate_multirun_result
233
  self.job_returns: List[JobReturn] = []
234
+ self.multirun_aggregator_blacklist = multirun_aggregator_blacklist
235
+ self.multirun_create_ids_from_overrides = multirun_create_ids_from_overrides
236
+ self.multirun_job_id_key = multirun_job_id_key
237
+ self.markdown_round_digits = markdown_round_digits
238
+ self.multirun_paths_file = multirun_paths_file
239
+ self.multirun_path_id = multirun_path_id
240
+ self.paths_file = paths_file
241
+ self.path_id = path_id
242
 
243
  def on_job_end(self, config: DictConfig, job_return: JobReturn, **kwargs: Any) -> None:
244
  self.job_returns.append(job_return)
245
  output_dir = Path(config.hydra.runtime.output_dir) # / Path(config.hydra.output_subdir)
246
+ if self.paths_file is not None:
247
+ # append the output_dir to the file
248
+ with open(self.paths_file, "a") as file:
249
+ file.write(f"{output_dir}\n")
250
+
251
  for filename in self.filenames:
252
  self._save(obj=job_return.return_value, filename=filename, output_dir=output_dir)
253
 
254
  def on_multirun_end(self, config: DictConfig, **kwargs: Any) -> None:
255
+ job_ids: Union[List[str], List[int]]
256
+ if self.multirun_create_ids_from_overrides:
257
+ job_ids = overrides_to_identifiers([jr.overrides for jr in self.job_returns])
258
+ else:
259
+ job_ids = list(range(len(self.job_returns)))
260
+
261
  if self.integrate_multirun_result:
262
  # rearrange the job return-values of all jobs from a multi-run into a dict of lists (maybe nested),
263
  obj = list_of_dicts_to_dict_of_lists_recursive(
264
  [jr.return_value for jr in self.job_returns]
265
  )
266
+ if not isinstance(obj, dict):
267
+ obj = {"value": obj}
268
+ if self.multirun_create_ids_from_overrides:
269
+ obj[self.multirun_job_id_key] = job_ids
270
+
271
  # also create an aggregated result
272
  # convert to python object to allow selecting numeric columns
273
  obj_py = to_py_obj(obj)
 
287
  else:
288
  # aggregate the numeric values
289
  df_described = df_numbers_only.describe()
290
+ # remove rows in the blacklist
291
+ if self.multirun_aggregator_blacklist is not None:
292
+ df_described = df_described.drop(
293
+ self.multirun_aggregator_blacklist, errors="ignore", axis="index"
294
+ )
295
  # add the aggregation keys (e.g. mean, min, ...) as most inner keys and convert back to dict
296
  obj_flat_aggregated = df_described.T.stack().to_dict()
297
  # unflatten because _save() works better with nested dicts
 
299
  else:
300
  # create a dict of the job return-values of all jobs from a multi-run
301
  # (_save() works better with nested dicts)
302
+ obj = {
303
+ identifier: jr.return_value for identifier, jr in zip(job_ids, self.job_returns)
304
+ }
305
  obj_aggregated = None
306
  output_dir = Path(config.hydra.sweep.dir)
307
+ if self.multirun_paths_file is not None:
308
+ # append the output_dir to the file
309
+ line = f"{output_dir}\n"
310
+ if self.multirun_path_id is not None:
311
+ line = f"{self.multirun_path_id}:{line}"
312
+ with open(self.multirun_paths_file, "a") as file:
313
+ file.write(line)
314
+
315
  for filename in self.filenames:
316
  self._save(
317
  obj=obj,
318
  filename=filename,
319
  output_dir=output_dir,
320
+ is_tabular_data=self.integrate_multirun_result,
321
  )
322
  # if available, also save the aggregated result
323
  if obj_aggregated is not None:
324
  file_base_name, ext = os.path.splitext(filename)
325
  filename_aggregated = f"{file_base_name}.aggregated{ext}"
326
+ self._save(
327
+ obj=obj_aggregated,
328
+ filename=filename_aggregated,
329
+ output_dir=output_dir,
330
+ # If we have aggregated (integrated multi-run) results, we unstack the last level,
331
+ # i.e. the aggregation key.
332
+ unstack_last_index_level=True,
333
+ )
334
 
335
  def _save(
336
+ self,
337
+ obj: Any,
338
+ filename: str,
339
+ output_dir: Path,
340
+ is_tabular_data: bool = False,
341
+ unstack_last_index_level: bool = False,
342
  ) -> None:
343
  self.log.info(f"Saving job_return in {output_dir / filename}")
344
  output_dir.mkdir(parents=True, exist_ok=True)
 
354
  elif filename.endswith(".md"):
355
  # Convert PyTorch tensors and numpy arrays to native python types
356
  obj_py = to_py_obj(obj)
357
+ if not isinstance(obj_py, dict):
358
+ obj_py = {"value": obj_py}
359
  obj_py_flat = flatten_dict(obj_py)
360
 
361
+ if is_tabular_data:
362
+ # In the case of (not aggregated) integrated multi-run result, we expect to have
363
+ # multiple values for each key. We therefore just convert the dict to a pandas DataFrame.
364
  result = pd.DataFrame(obj_py_flat)
365
+ job_id_column = (self.multirun_job_id_key,) + (np.nan,) * (
366
+ result.columns.nlevels - 1
367
+ )
368
+ if job_id_column in result.columns:
369
+ result = result.set_index(job_id_column)
370
+ result.index.name = self.multirun_job_id_key
371
  else:
372
+ # Otherwise, we have only one value for each key. We convert the dict to a pandas Series.
 
373
  series = pd.Series(obj_py_flat)
374
+ # The series has a MultiIndex because flatten_dict() uses a tuple as key.
375
+ if len(series.index.levels) <= 1:
376
+ # If there is only one level, we just use the first level values as index.
 
 
377
  series.index = series.index.get_level_values(0)
378
  result = series
379
+ else:
380
+ # If there are multiple levels, we unstack the series to get a DataFrame
381
+ # providing a better overview.
382
+ if unstack_last_index_level:
383
+ # If we have aggregated (integrated multi-run) results, we unstack the last level,
384
+ # i.e. the aggregation key.
385
+ result = series.unstack(-1)
386
+ else:
387
+ # Otherwise we have a default multi-run result and unstack the first level,
388
+ # i.e. the identifier created from the overrides, and transpose the result
389
+ # to have the individual jobs as rows.
390
+ result = series.unstack(0).T
391
+
392
+ if self.markdown_round_digits is not None:
393
+ result = result.round(self.markdown_round_digits)
394
 
395
  with open(str(output_dir / filename), "w") as file:
396
  file.write(result.to_markdown())
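The padding behaviour added to flatten_dict/unflatten_dict above can be summarized with a small round trip (values taken from the docstring examples):

from src.hydra_callbacks.save_job_return_value import flatten_dict, unflatten_dict

nested = {"a": {"b": {"c": 1, "d": 2}, "e": 3}}
flat = flatten_dict(nested)            # keys are padded with np.nan to equal length
# {('a', 'b', 'c'): 1, ('a', 'b', 'd'): 2, ('a', 'e', nan): 3}
assert unflatten_dict(flat) == nested  # the np.nan padding is stripped again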
src/langchain_modules/pie_document_store.py CHANGED
@@ -75,7 +75,7 @@ class PieDocumentStore(SerializableStore, BaseStore[str, LCDocument], abc.ABC):
75
  caption: pie_document[layer_name] for layer_name, caption in layer_captions.items()
76
  }
77
  layer_sizes = {
78
- f"num_{caption}s": len(layer) + (len(layer.predictions) if use_predictions else 0)
79
  for caption, layer in layers.items()
80
  }
81
  rows.append({"doc_id": doc_id, **layer_sizes})
 
75
  caption: pie_document[layer_name] for layer_name, caption in layer_captions.items()
76
  }
77
  layer_sizes = {
78
+ f"num_{caption}": len(layer) + (len(layer.predictions) if use_predictions else 0)
79
  for caption, layer in layers.items()
80
  }
81
  rows.append({"doc_id": doc_id, **layer_sizes})
src/langchain_modules/span_retriever.py CHANGED
@@ -23,6 +23,7 @@ from pytorch_ie.documents import (
23
  TextDocumentWithSpans,
24
  )
25
 
 
26
  from .pie_document_store import PieDocumentStore
27
  from .serializable_store import SerializableStore
28
  from .span_vectorstore import SpanVectorStore
@@ -30,20 +31,6 @@ from .span_vectorstore import SpanVectorStore
30
  logger = logging.getLogger(__name__)
31
 
32
 
33
- def _parse_config(config_string: str, format: str) -> Dict[str, Any]:
34
- """Parse a configuration string."""
35
- if format == "json":
36
- import json
37
-
38
- return json.loads(config_string)
39
- elif format == "yaml":
40
- import yaml
41
-
42
- return yaml.safe_load(config_string)
43
- else:
44
- raise ValueError(f"Unsupported format: {format}. Use 'json' or 'yaml'.")
45
-
46
-
47
  METADATA_KEY_CHILD_ID2IDX = "child_id2idx"
48
 
49
 
@@ -136,7 +123,7 @@ class DocumentAwareSpanRetriever(BaseRetriever, SerializableStore):
136
  ) -> "DocumentAwareSpanRetriever":
137
  """Instantiate a retriever from a configuration string."""
138
  return cls.instantiate_from_config(
139
- _parse_config(config_string, format=format), overwrites=overwrites
140
  )
141
 
142
  @classmethod
@@ -725,6 +712,8 @@ class DocumentAwareSpanRetrieverWithRelations(DocumentAwareSpanRetriever):
725
  """The list of span labels to consider."""
726
  reversed_relations_suffix: Optional[str] = None
727
  """Whether to consider reverse relations as well."""
 
 
728
 
729
  def get_relation_layer(
730
  self, pie_document: TextBasedDocument, use_predicted_annotations: bool
@@ -762,11 +751,19 @@ class DocumentAwareSpanRetrieverWithRelations(DocumentAwareSpanRetriever):
762
  )
763
 
764
  for relation in relations:
 
 
 
 
765
  if self.relation_labels is None or relation.label in self.relation_labels:
766
  head2label2tails_with_scores[span2id[relation.head]][relation.label].append(
767
  (span2id[relation.tail], relation.score)
768
  )
769
- if self.reversed_relations_suffix is not None:
 
 
 
 
770
  reversed_label = f"{relation.label}{self.reversed_relations_suffix}"
771
  if self.relation_labels is None or reversed_label in self.relation_labels:
772
  head2label2tails_with_scores[span2id[relation.tail]][
 
23
  TextDocumentWithSpans,
24
  )
25
 
26
+ from ..utils import parse_config
27
  from .pie_document_store import PieDocumentStore
28
  from .serializable_store import SerializableStore
29
  from .span_vectorstore import SpanVectorStore
 
31
  logger = logging.getLogger(__name__)
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  METADATA_KEY_CHILD_ID2IDX = "child_id2idx"
35
 
36
 
 
123
  ) -> "DocumentAwareSpanRetriever":
124
  """Instantiate a retriever from a configuration string."""
125
  return cls.instantiate_from_config(
126
+ parse_config(config_string, format=format), overwrites=overwrites
127
  )
128
 
129
  @classmethod
 
712
  """The list of span labels to consider."""
713
  reversed_relations_suffix: Optional[str] = None
714
  """Whether to consider reverse relations as well."""
715
+ symmetric_relations: Optional[list[str]] = None
716
+ """The list of relation labels that are symmetric."""
717
 
718
  def get_relation_layer(
719
  self, pie_document: TextBasedDocument, use_predicted_annotations: bool
 
751
  )
752
 
753
  for relation in relations:
754
+ is_symmetric = (
755
+ self.symmetric_relations is not None
756
+ and relation.label in self.symmetric_relations
757
+ )
758
  if self.relation_labels is None or relation.label in self.relation_labels:
759
  head2label2tails_with_scores[span2id[relation.head]][relation.label].append(
760
  (span2id[relation.tail], relation.score)
761
  )
762
+ if is_symmetric:
763
+ head2label2tails_with_scores[span2id[relation.tail]][
764
+ relation.label
765
+ ].append((span2id[relation.head], relation.score))
766
+ if self.reversed_relations_suffix is not None and not is_symmetric:
767
  reversed_label = f"{relation.label}{self.reversed_relations_suffix}"
768
  if self.relation_labels is None or reversed_label in self.relation_labels:
769
  head2label2tails_with_scores[span2id[relation.tail]][
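A sketch of how the new symmetric_relations field above might be set, assuming the retriever fields can be supplied at the top level of the config dict passed to instantiate_from_config (the label names are invented):

retriever = DocumentAwareSpanRetrieverWithRelations.instantiate_from_config(
    {
        # ... vectorstore / docstore entries omitted ...
        "relation_labels": ["supports", "contradicts", "semantically_same"],
        "reversed_relations_suffix": "_reversed",
        # symmetric labels are mirrored head<->tail with the same label instead of
        # getting a "_reversed" variant
        "symmetric_relations": ["semantically_same"],
    }
)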
src/pipeline/ner_re_pipeline.py CHANGED
@@ -2,7 +2,18 @@ from __future__ import annotations
2
 
3
  import logging
4
  from functools import partial
5
- from typing import Callable, Dict, Iterable, List, Optional, Sequence, Type, TypeVar, Union
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  from pie_modules.utils import resolve_type
8
  from pytorch_ie import AutoPipeline, WithDocumentTypeMixin
@@ -72,11 +83,13 @@ def add_annotations_from_other_documents(
72
  def process_pipeline_steps(
73
  documents: Sequence[Document],
74
  processors: Dict[str, Callable[[Sequence[Document]], Optional[Sequence[Document]]]],
 
75
  ) -> Sequence[Document]:
76
 
77
  # call the processors in the order they are provided
78
  for step_name, processor in processors.items():
79
- logger.info(f"process {step_name} ...")
 
80
  processed_documents = processor(documents)
81
  if processed_documents is not None:
82
  documents = processed_documents
@@ -120,6 +133,7 @@ class NerRePipeline:
120
  batch_size: Optional[int] = None,
121
  show_progress_bar: Optional[bool] = None,
122
  document_type: Optional[Union[Type[Document], str]] = None,
 
123
  **processor_kwargs,
124
  ):
125
  self.taskmodule = DummyTaskmodule(document_type)
@@ -128,6 +142,7 @@ class NerRePipeline:
128
  self.processor_kwargs = processor_kwargs or {}
129
  self.entity_layer = entity_layer
130
  self.relation_layer = relation_layer
 
131
  # set some values for the inference processors, if provided
132
  for inference_pipeline in ["ner_pipeline", "re_pipeline"]:
133
  if inference_pipeline not in self.processor_kwargs:
@@ -145,7 +160,29 @@ class NerRePipeline:
145
  ):
146
  self.processor_kwargs[inference_pipeline]["show_progress_bar"] = show_progress_bar
147
 
148
- def __call__(self, documents: Sequence[Document], inplace: bool = False) -> Sequence[Document]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  input_docs: Sequence[Document]
151
  # we need to keep the original documents to add the gold data back
@@ -166,24 +203,14 @@ class NerRePipeline:
166
  layer_names=[self.entity_layer, self.relation_layer],
167
  **self.processor_kwargs.get("clear_annotations", {}),
168
  ),
169
- "ner_pipeline": AutoPipeline.from_pretrained(
170
- self.ner_model_path, **self.processor_kwargs.get("ner_pipeline", {})
171
- ),
172
  "use_predicted_entities": partial(
173
  process_documents,
174
  processor=move_annotations_from_predictions,
175
  layer_names=[self.entity_layer],
176
  **self.processor_kwargs.get("use_predicted_entities", {}),
177
  ),
178
- # "create_candidate_relations": partial(
179
- # process_documents,
180
- # processor=CandidateRelationAdder(
181
- # **self.processor_kwargs.get("create_candidate_relations", {})
182
- # ),
183
- # ),
184
- "re_pipeline": AutoPipeline.from_pretrained(
185
- self.re_model_path, **self.processor_kwargs.get("re_pipeline", {})
186
- ),
187
  # otherwise we can not move the entities back to predictions
188
  "clear_candidate_relations": partial(
189
  process_documents,
@@ -204,5 +231,8 @@ class NerRePipeline:
204
  **self.processor_kwargs.get("re_add_gold_data", {}),
205
  ),
206
  },
 
207
  )
 
 
208
  return docs_with_predictions
 
2
 
3
  import logging
4
  from functools import partial
5
+ from typing import (
6
+ Callable,
7
+ Dict,
8
+ Iterable,
9
+ List,
10
+ Optional,
11
+ Sequence,
12
+ Type,
13
+ TypeVar,
14
+ Union,
15
+ overload,
16
+ )
17
 
18
  from pie_modules.utils import resolve_type
19
  from pytorch_ie import AutoPipeline, WithDocumentTypeMixin
 
83
  def process_pipeline_steps(
84
  documents: Sequence[Document],
85
  processors: Dict[str, Callable[[Sequence[Document]], Optional[Sequence[Document]]]],
86
+ verbose: bool = False,
87
  ) -> Sequence[Document]:
88
 
89
  # call the processors in the order they are provided
90
  for step_name, processor in processors.items():
91
+ if verbose:
92
+ logger.info(f"process {step_name} ...")
93
  processed_documents = processor(documents)
94
  if processed_documents is not None:
95
  documents = processed_documents
 
133
  batch_size: Optional[int] = None,
134
  show_progress_bar: Optional[bool] = None,
135
  document_type: Optional[Union[Type[Document], str]] = None,
136
+ verbose: bool = True,
137
  **processor_kwargs,
138
  ):
139
  self.taskmodule = DummyTaskmodule(document_type)
 
142
  self.processor_kwargs = processor_kwargs or {}
143
  self.entity_layer = entity_layer
144
  self.relation_layer = relation_layer
145
+ self.verbose = verbose
146
  # set some values for the inference processors, if provided
147
  for inference_pipeline in ["ner_pipeline", "re_pipeline"]:
148
  if inference_pipeline not in self.processor_kwargs:
 
160
  ):
161
  self.processor_kwargs[inference_pipeline]["show_progress_bar"] = show_progress_bar
162
 
163
+ self.ner_pipeline = AutoPipeline.from_pretrained(
164
+ self.ner_model_path, **self.processor_kwargs.get("ner_pipeline", {})
165
+ )
166
+ self.re_pipeline = AutoPipeline.from_pretrained(
167
+ self.re_model_path, **self.processor_kwargs.get("re_pipeline", {})
168
+ )
169
+
170
+ @overload
171
+ def __call__(
172
+ self, documents: Sequence[Document], inplace: bool = False
173
+ ) -> Sequence[Document]: ...
174
+
175
+ @overload
176
+ def __call__(self, documents: Document, inplace: bool = False) -> Document: ...
177
+
178
+ def __call__(
179
+ self, documents: Union[Sequence[Document], Document], inplace: bool = False
180
+ ) -> Union[Sequence[Document], Document]:
181
+
182
+ is_single_doc = False
183
+ if isinstance(documents, Document):
184
+ documents = [documents]
185
+ is_single_doc = True
186
 
187
  input_docs: Sequence[Document]
188
  # we need to keep the original documents to add the gold data back
 
203
  layer_names=[self.entity_layer, self.relation_layer],
204
  **self.processor_kwargs.get("clear_annotations", {}),
205
  ),
206
+ "ner_pipeline": self.ner_pipeline,
 
 
207
  "use_predicted_entities": partial(
208
  process_documents,
209
  processor=move_annotations_from_predictions,
210
  layer_names=[self.entity_layer],
211
  **self.processor_kwargs.get("use_predicted_entities", {}),
212
  ),
213
+ "re_pipeline": self.re_pipeline,
 
 
 
 
 
 
 
 
214
  # otherwise we can not move the entities back to predictions
215
  "clear_candidate_relations": partial(
216
  process_documents,
 
231
  **self.processor_kwargs.get("re_add_gold_data", {}),
232
  ),
233
  },
234
+ verbose=self.verbose,
235
  )
236
+ if is_single_doc:
237
+ return docs_with_predictions[0]
238
  return docs_with_predictions
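With the overloads added above, the pipeline now accepts a single document as well as a sequence; a usage sketch (model paths and layer names are example values):

from src.pipeline.ner_re_pipeline import NerRePipeline

pipeline = NerRePipeline(
    ner_model_path="path/to/ner_model",   # placeholder
    re_model_path="path/to/re_model",     # placeholder
    entity_layer="labeled_spans",
    relation_layer="binary_relations",
    verbose=False,                        # new: silences the per-step logging
)
doc_with_predictions = pipeline(doc)              # single Document in, single Document out
docs_with_predictions = pipeline([doc_a, doc_b])  # Sequence in, Sequence out

Note that both inference pipelines are now instantiated once in __init__ rather than on every call.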
src/predict.py CHANGED
@@ -106,8 +106,12 @@ def predict(cfg: DictConfig) -> Tuple[dict, dict]:
106
  # Per default, the model is loaded with .from_pretrained() which already loads the weights.
107
  # However, ckpt_path can be used to load different weights from any checkpoint.
108
  if cfg.ckpt_path is not None:
109
- pipeline.model = pipeline.model.load_from_checkpoint(checkpoint_path=cfg.ckpt_path).to(
110
- pipeline.device
 
 
 
 
111
  )
112
 
113
  # auto-convert the dataset if the metric specifies a document type
 
106
  # Per default, the model is loaded with .from_pretrained() which already loads the weights.
107
  # However, ckpt_path can be used to load different weights from any checkpoint.
108
  if cfg.ckpt_path is not None:
109
+ log.info(f"Loading model weights from checkpoint: {cfg.ckpt_path}")
110
+ pipeline.model = (
111
+ type(pipeline.model)
112
+ .load_from_checkpoint(checkpoint_path=cfg.ckpt_path)
113
+ .to(pipeline.device)
114
+ .to(dtype=pipeline.model.dtype)
115
  )
116
 
117
  # auto-convert the dataset if the metric specifies a document type
src/start_demo.py CHANGED
@@ -19,8 +19,10 @@ import yaml
19
  from src.demo.annotation_utils import load_argumentation_model
20
  from src.demo.backend_utils import (
21
  download_processed_documents,
 
22
  process_text_from_arxiv,
23
  process_uploaded_files,
 
24
  render_annotated_document,
25
  upload_processed_documents,
26
  wrapped_add_annotated_pie_documents_from_dataset,
@@ -31,6 +33,7 @@ from src.demo.frontend_utils import (
31
  escape_regex,
32
  get_cell_for_fixed_column_from_df,
33
  open_accordion,
 
34
  unescape_regex,
35
  )
36
  from src.demo.rendering_utils import AVAILABLE_RENDER_MODES, HIGHLIGHT_SPANS_JS
@@ -67,12 +70,10 @@ def main(cfg: DictConfig) -> None:
67
 
68
  example_text = cfg["example_text"]
69
 
70
- default_device = "cuda" if torch.cuda.is_available() else "cpu"
71
 
72
- default_retriever_config_str = load_yaml_config(cfg["default_retriever_config_path"])
73
-
74
- default_model_name = cfg["default_model_name"]
75
- default_model_revision = cfg["default_model_revision"]
76
  handle_parts_of_same = cfg["handle_parts_of_same"]
77
 
78
  default_arxiv_id = cfg["default_arxiv_id"]
@@ -97,19 +98,32 @@ def main(cfg: DictConfig) -> None:
97
  }
98
  render_caption2mode = {v: k for k, v in render_mode2caption.items()}
99
  default_min_similarity = cfg["default_min_similarity"]
 
100
  layer_caption_mapping = cfg["layer_caption_mapping"]
101
  relation_name_mapping = cfg["relation_name_mapping"]
102
 
 
 
 
 
 
 
 
103
  gr.Info("Loading models ...")
104
  argumentation_model = load_argumentation_model(
105
- model_name=default_model_name,
106
- revision=default_model_revision,
107
  device=default_device,
108
  )
109
  retriever = load_retriever(
110
- default_retriever_config_str, device=default_device, config_format="yaml"
111
  )
112
 
 
 
 
 
 
 
113
  with gr.Blocks() as demo:
114
  # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
115
  # models_state = gr.State((argumentation_model, embedding_model))
@@ -131,18 +145,16 @@ def main(cfg: DictConfig) -> None:
131
 
132
  with gr.Accordion("Model Configuration", open=False):
133
  with gr.Accordion("argumentation structure", open=True):
134
- model_name = gr.Textbox(
135
- label="Model Name",
136
- value=default_model_name,
137
- )
138
- model_revision = gr.Textbox(
139
- label="Model Revision",
140
- value=default_model_revision,
141
  )
142
  load_arg_model_btn = gr.Button("Load Argumentation Model")
143
 
144
  with gr.Accordion("retriever", open=True):
145
- retriever_config = gr.Code(
146
  language="yaml",
147
  label="Retriever Configuration",
148
  value=default_retriever_config_str,
@@ -155,26 +167,25 @@ def main(cfg: DictConfig) -> None:
155
  value=default_device,
156
  )
157
  load_arg_model_btn.click(
158
- fn=lambda _model_name, _model_revision, _device: (
159
  load_argumentation_model(
160
- model_name=_model_name,
161
- revision=_model_revision,
162
  device=_device,
163
  ),
164
  ),
165
- inputs=[model_name, model_revision, device],
166
  outputs=argumentation_model_state,
167
  )
168
  load_retriever_btn.click(
169
  fn=lambda _retriever_config, _device, _previous_retriever: (
170
  load_retriever(
171
- retriever_config_str=_retriever_config,
172
  device=_device,
173
  previous_retriever=_previous_retriever[0],
174
  config_format="yaml",
175
  ),
176
  ),
177
- inputs=[retriever_config, device, retriever_state],
178
  outputs=retriever_state,
179
  )
180
 
@@ -213,7 +224,7 @@ def main(cfg: DictConfig) -> None:
213
  with gr.Tabs() as right_tabs:
214
  with gr.Tab("Retrieval", id="retrieval") as retrieval_tab:
215
  with gr.Accordion(
216
- "Indexed Documents", open=False
217
  ) as processed_documents_accordion:
218
  processed_documents_df = gr.DataFrame(
219
  headers=["id", "num_adus", "num_relations"],
@@ -274,7 +285,7 @@ def main(cfg: DictConfig) -> None:
274
  minimum=2,
275
  maximum=50,
276
  step=1,
277
- value=10,
278
  )
279
  retrieve_similar_adus_btn = gr.Button(
280
  "Retrieve *similar* ADUs for *selected* ADU"
@@ -293,8 +304,10 @@ def main(cfg: DictConfig) -> None:
293
  "Retrieve *relevant* ADUs for *all* ADUs in the document"
294
  )
295
  all_relevant_adus_df = gr.DataFrame(
296
- headers=["doc_id", "adu_id", "score", "text"], interactive=False
 
297
  )
 
298
 
299
  with gr.Tab("Import Documents", id="import_documents") as import_documents_tab:
300
  upload_btn = gr.UploadButton(
@@ -303,6 +316,28 @@ def main(cfg: DictConfig) -> None:
303
  file_count="multiple",
304
  )
305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  with gr.Accordion("Import text from arXiv", open=False):
307
  arxiv_id = gr.Textbox(
308
  label="arXiv paper ID",
@@ -326,13 +361,25 @@ def main(cfg: DictConfig) -> None:
326
  load_pie_dataset_btn = gr.Button("Load & Embed PIE Dataset")
327
 
328
  render_event_kwargs = dict(
329
- fn=lambda _retriever, _document_id, _render_as, _render_kwargs: render_annotated_document(
330
  retriever=_retriever[0],
331
  document_id=_document_id,
332
  render_with=render_caption2mode[_render_as],
333
  render_kwargs_json=_render_kwargs,
 
 
 
 
 
334
  ),
335
- inputs=[retriever_state, selected_document_id, render_as, render_kwargs],
 
 
 
 
 
 
 
336
  outputs=rendered_output,
337
  )
338
 
@@ -343,6 +390,16 @@ def main(cfg: DictConfig) -> None:
343
  inputs=[retriever_state],
344
  outputs=[processed_documents_df],
345
  )
 
 
 
 
 
 
 
 
 
 
346
  predict_btn.click(
347
  fn=lambda: change_tab(analysed_document_tab.id), inputs=[], outputs=[left_tabs]
348
  ).then(
@@ -367,6 +424,8 @@ def main(cfg: DictConfig) -> None:
367
  api_name="predict",
368
  ).success(
369
  **show_overview_kwargs
 
 
370
  ).success(
371
  **render_event_kwargs
372
  )
@@ -396,6 +455,8 @@ def main(cfg: DictConfig) -> None:
396
  api_name="predict",
397
  ).success(
398
  **show_overview_kwargs
 
 
399
  )
400
 
401
  load_pie_dataset_btn.click(
@@ -409,6 +470,8 @@ def main(cfg: DictConfig) -> None:
409
  ),
410
  inputs=[retriever_state, load_pie_dataset_kwargs_str],
411
  outputs=[processed_documents_df],
 
 
412
  )
413
 
414
  selected_document_id.change(
@@ -430,7 +493,9 @@ def main(cfg: DictConfig) -> None:
430
  file_names=_file_names,
431
  argumentation_model=_argumentation_model[0],
432
  retriever=_retriever[0],
433
- split_regex_escaped=unescape_regex(_split_regex_escaped),
 
 
434
  handle_parts_of_same=handle_parts_of_same,
435
  layer_captions=layer_caption_mapping,
436
  ),
@@ -441,7 +506,61 @@ def main(cfg: DictConfig) -> None:
441
  split_regex_escaped,
442
  ],
443
  outputs=[processed_documents_df],
444
  )
 
445
  processed_documents_df.select(
446
  fn=get_cell_for_fixed_column_from_df,
447
  inputs=[processed_documents_df, gr.State("doc_id")],
@@ -461,7 +580,7 @@ def main(cfg: DictConfig) -> None:
461
  ),
462
  inputs=[upload_processed_documents_btn, retriever_state],
463
  outputs=[processed_documents_df],
464
- )
465
 
466
  retrieve_relevant_adus_event_kwargs = dict(
467
  fn=lambda _retriever, _selected_adu_id, _min_similarity, _top_k: retrieve_relevant_spans(
@@ -533,12 +652,16 @@ def main(cfg: DictConfig) -> None:
533
  )
534
 
535
  retrieve_all_relevant_adus_btn.click(
536
- fn=lambda _retriever, _document_id, _min_similarity, _tok_k: retrieve_all_relevant_spans(
537
- retriever=_retriever[0],
538
- query_doc_id=_document_id,
539
- k=_tok_k,
540
- score_threshold=_min_similarity,
541
- query_span_id_column="query_span_id",
 
 
 
 
542
  ),
543
  inputs=[
544
  retriever_state,
@@ -546,9 +669,11 @@ def main(cfg: DictConfig) -> None:
546
  min_similarity,
547
  top_k,
548
  ],
549
- outputs=[all_relevant_adus_df],
550
  )
551
 
 
 
552
  # select query span id from the "retrieve all" result data frames
553
  all_similar_adus_df.select(
554
  fn=get_cell_for_fixed_column_from_df,
 
19
  from src.demo.annotation_utils import load_argumentation_model
20
  from src.demo.backend_utils import (
21
  download_processed_documents,
22
+ load_acl_anthology_venues,
23
  process_text_from_arxiv,
24
  process_uploaded_files,
25
+ process_uploaded_pdf_files,
26
  render_annotated_document,
27
  upload_processed_documents,
28
  wrapped_add_annotated_pie_documents_from_dataset,
 
33
  escape_regex,
34
  get_cell_for_fixed_column_from_df,
35
  open_accordion,
36
+ open_accordion_with_stats,
37
  unescape_regex,
38
  )
39
  from src.demo.rendering_utils import AVAILABLE_RENDER_MODES, HIGHLIGHT_SPANS_JS
 
70
 
71
  example_text = cfg["example_text"]
72
 
73
+ default_device = "cuda:0" if torch.cuda.is_available() else "cpu"
74
 
75
+ default_retriever_config_str = yaml.dump(cfg["retriever"])
76
+ default_argumentation_model_config_str = yaml.dump(cfg["argumentation_model"])
 
 
77
  handle_parts_of_same = cfg["handle_parts_of_same"]
78
 
79
  default_arxiv_id = cfg["default_arxiv_id"]
 
98
  }
99
  render_caption2mode = {v: k for k, v in render_mode2caption.items()}
100
  default_min_similarity = cfg["default_min_similarity"]
101
+ default_top_k = cfg["default_top_k"]
102
  layer_caption_mapping = cfg["layer_caption_mapping"]
103
  relation_name_mapping = cfg["relation_name_mapping"]
104
 
105
+ indexed_documents_label = "Indexed Documents"
106
+ indexed_documents_caption2column = {
107
+ "documents": "TOTAL",
108
+ "ADUs": "num_adus",
109
+ "Relations": "num_relations",
110
+ }
111
+
112
  gr.Info("Loading models ...")
113
  argumentation_model = load_argumentation_model(
114
+ config_str=default_argumentation_model_config_str,
 
115
  device=default_device,
116
  )
117
  retriever = load_retriever(
118
+ config_str=default_retriever_config_str, device=default_device, config_format="yaml"
119
  )
120
 
121
+ if cfg.get("pdf_fulltext_extractor"):
122
+ gr.Info("Loading PDF fulltext extractor ...")
123
+ pdf_fulltext_extractor = hydra.utils.instantiate(cfg["pdf_fulltext_extractor"])
124
+ else:
125
+ pdf_fulltext_extractor = None
126
+
127
  with gr.Blocks() as demo:
128
  # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
129
  # models_state = gr.State((argumentation_model, embedding_model))
 
145
 
146
  with gr.Accordion("Model Configuration", open=False):
147
  with gr.Accordion("argumentation structure", open=True):
148
+ argumentation_model_config_str = gr.Code(
149
+ language="yaml",
150
+ label="Argumentation Model Configuration",
151
+ value=default_argumentation_model_config_str,
152
+ lines=len(default_argumentation_model_config_str.split("\n")),
 
 
153
  )
154
  load_arg_model_btn = gr.Button("Load Argumentation Model")
155
 
156
  with gr.Accordion("retriever", open=True):
157
+ retriever_config_str = gr.Code(
158
  language="yaml",
159
  label="Retriever Configuration",
160
  value=default_retriever_config_str,
 
167
  value=default_device,
168
  )
169
  load_arg_model_btn.click(
170
+ fn=lambda _argumentation_model_config_str, _device: (
171
  load_argumentation_model(
172
+ config_str=_argumentation_model_config_str,
 
173
  device=_device,
174
  ),
175
  ),
176
+ inputs=[argumentation_model_config_str, device],
177
  outputs=argumentation_model_state,
178
  )
179
  load_retriever_btn.click(
180
  fn=lambda _retriever_config, _device, _previous_retriever: (
181
  load_retriever(
182
+ config_str=_retriever_config,
183
  device=_device,
184
  previous_retriever=_previous_retriever[0],
185
  config_format="yaml",
186
  ),
187
  ),
188
+ inputs=[retriever_config_str, device, retriever_state],
189
  outputs=retriever_state,
190
  )
191
 
 
224
  with gr.Tabs() as right_tabs:
225
  with gr.Tab("Retrieval", id="retrieval") as retrieval_tab:
226
  with gr.Accordion(
227
+ indexed_documents_label, open=False
228
  ) as processed_documents_accordion:
229
  processed_documents_df = gr.DataFrame(
230
  headers=["id", "num_adus", "num_relations"],
 
285
  minimum=2,
286
  maximum=50,
287
  step=1,
288
+ value=default_top_k,
289
  )
290
  retrieve_similar_adus_btn = gr.Button(
291
  "Retrieve *similar* ADUs for *selected* ADU"
 
304
  "Retrieve *relevant* ADUs for *all* ADUs in the document"
305
  )
306
  all_relevant_adus_df = gr.DataFrame(
307
+ headers=["doc_id", "adu_id", "score", "text", "query_span_id"],
308
+ interactive=False,
309
  )
310
+ all_relevant_adus_query_doc_id = gr.Textbox(visible=False)
311
 
312
  with gr.Tab("Import Documents", id="import_documents") as import_documents_tab:
313
  upload_btn = gr.UploadButton(
 
316
  file_count="multiple",
317
  )
318
 
319
+ upload_pdf_btn = gr.UploadButton(
320
+ "Batch Analyse PDFs",
321
+ # file_types=["pdf"],
322
+ file_count="multiple",
323
+ visible=pdf_fulltext_extractor is not None,
324
+ )
325
+
326
+ enable_acl_venue_loading = pdf_fulltext_extractor is not None and cfg.get(
327
+ "acl_anthology_pdf_dir"
328
+ )
329
+ acl_anthology_venues = gr.Textbox(
330
+ label="ACL Anthology Venues",
331
+ value="wiesp",
332
+ max_lines=1,
333
+ visible=enable_acl_venue_loading,
334
+ )
335
+ load_acl_anthology_venues_btn = gr.Button(
336
+ "Import from ACL Anthology",
337
+ variant="secondary",
338
+ visible=enable_acl_venue_loading,
339
+ )
340
+
341
  with gr.Accordion("Import text from arXiv", open=False):
342
  arxiv_id = gr.Textbox(
343
  label="arXiv paper ID",
 
361
  load_pie_dataset_btn = gr.Button("Load & Embed PIE Dataset")
362
 
363
  render_event_kwargs = dict(
364
+ fn=lambda _retriever, _document_id, _render_as, _render_kwargs, _all_relevant_adus_df, _all_relevant_adus_query_doc_id: render_annotated_document(
365
  retriever=_retriever[0],
366
  document_id=_document_id,
367
  render_with=render_caption2mode[_render_as],
368
  render_kwargs_json=_render_kwargs,
369
+ highlight_span_ids=(
370
+ _all_relevant_adus_df["query_span_id"].tolist()
371
+ if _document_id == _all_relevant_adus_query_doc_id
372
+ else None
373
+ ),
374
  ),
375
+ inputs=[
376
+ retriever_state,
377
+ selected_document_id,
378
+ render_as,
379
+ render_kwargs,
380
+ all_relevant_adus_df,
381
+ all_relevant_adus_query_doc_id,
382
+ ],
383
  outputs=rendered_output,
384
  )
385
 
 
390
  inputs=[retriever_state],
391
  outputs=[processed_documents_df],
392
  )
393
+ show_stats_kwargs = dict(
394
+ fn=lambda _processed_documents_df: open_accordion_with_stats(
395
+ _processed_documents_df,
396
+ base_label=indexed_documents_label,
397
+ caption2column=indexed_documents_caption2column,
398
+ total_column="TOTAL",
399
+ ),
400
+ inputs=[processed_documents_df],
401
+ outputs=[processed_documents_accordion],
402
+ )
403
  predict_btn.click(
404
  fn=lambda: change_tab(analysed_document_tab.id), inputs=[], outputs=[left_tabs]
405
  ).then(
 
424
  api_name="predict",
425
  ).success(
426
  **show_overview_kwargs
427
+ ).success(
428
+ **show_stats_kwargs
429
  ).success(
430
  **render_event_kwargs
431
  )
 
455
  api_name="predict",
456
  ).success(
457
  **show_overview_kwargs
458
+ ).success(
459
+ **show_stats_kwargs
460
  )
461
 
462
  load_pie_dataset_btn.click(
 
470
  ),
471
  inputs=[retriever_state, load_pie_dataset_kwargs_str],
472
  outputs=[processed_documents_df],
473
+ ).success(
474
+ **show_stats_kwargs
475
  )
476
 
477
  selected_document_id.change(
 
493
  file_names=_file_names,
494
  argumentation_model=_argumentation_model[0],
495
  retriever=_retriever[0],
496
+ split_regex_escaped=(
497
+ unescape_regex(_split_regex_escaped) if _split_regex_escaped else None
498
+ ),
499
  handle_parts_of_same=handle_parts_of_same,
500
  layer_captions=layer_caption_mapping,
501
  ),
 
506
  split_regex_escaped,
507
  ],
508
  outputs=[processed_documents_df],
509
+ ).success(
510
+ **show_stats_kwargs
511
+ )
512
+ upload_pdf_btn.upload(
513
+ fn=lambda: change_tab(retrieval_tab.id), inputs=[], outputs=[right_tabs]
514
+ ).then(fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]).then(
515
+ fn=lambda _file_names, _argumentation_model, _retriever, _split_regex_escaped: process_uploaded_pdf_files(
516
+ file_names=_file_names,
517
+ argumentation_model=_argumentation_model[0],
518
+ retriever=_retriever[0],
519
+ split_regex_escaped=(
520
+ unescape_regex(_split_regex_escaped) if _split_regex_escaped else None
521
+ ),
522
+ handle_parts_of_same=handle_parts_of_same,
523
+ layer_captions=layer_caption_mapping,
524
+ pdf_fulltext_extractor=pdf_fulltext_extractor,
525
+ ),
526
+ inputs=[
527
+ upload_pdf_btn,
528
+ argumentation_model_state,
529
+ retriever_state,
530
+ split_regex_escaped,
531
+ ],
532
+ outputs=[processed_documents_df],
533
+ ).success(
534
+ **show_stats_kwargs
535
+ )
536
+
537
+ load_acl_anthology_venues_btn.click(
538
+ fn=lambda: change_tab(retrieval_tab.id), inputs=[], outputs=[right_tabs]
539
+ ).then(fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]).then(
540
+ fn=lambda _acl_anthology_venues, _argumentation_model, _retriever, _split_regex_escaped: load_acl_anthology_venues(
541
+ pdf_fulltext_extractor=pdf_fulltext_extractor,
542
+ venues=[venue.strip() for venue in _acl_anthology_venues.split(",")],
543
+ argumentation_model=_argumentation_model[0],
544
+ retriever=_retriever[0],
545
+ split_regex_escaped=(
546
+ unescape_regex(_split_regex_escaped) if _split_regex_escaped else None
547
+ ),
548
+ handle_parts_of_same=handle_parts_of_same,
549
+ layer_captions=layer_caption_mapping,
550
+ acl_anthology_data_dir=cfg.get("acl_anthology_data_dir"),
551
+ pdf_output_dir=cfg.get("acl_anthology_pdf_dir"),
552
+ ),
553
+ inputs=[
554
+ acl_anthology_venues,
555
+ argumentation_model_state,
556
+ retriever_state,
557
+ split_regex_escaped,
558
+ ],
559
+ outputs=[processed_documents_df],
560
+ ).success(
561
+ **show_stats_kwargs
562
  )
563
+
564
  processed_documents_df.select(
565
  fn=get_cell_for_fixed_column_from_df,
566
  inputs=[processed_documents_df, gr.State("doc_id")],
 
580
  ),
581
  inputs=[upload_processed_documents_btn, retriever_state],
582
  outputs=[processed_documents_df],
583
+ ).success(**show_stats_kwargs)
584
 
585
  retrieve_relevant_adus_event_kwargs = dict(
586
  fn=lambda _retriever, _selected_adu_id, _min_similarity, _top_k: retrieve_relevant_spans(
 
652
  )
653
 
654
  retrieve_all_relevant_adus_btn.click(
655
+ fn=lambda _retriever, _document_id, _min_similarity, _tok_k: (
656
+ retrieve_all_relevant_spans(
657
+ retriever=_retriever[0],
658
+ query_doc_id=_document_id,
659
+ k=_tok_k,
660
+ score_threshold=_min_similarity,
661
+ query_span_id_column="query_span_id",
662
+ query_span_text_column="query_span_text",
663
+ ),
664
+ _document_id,
665
  ),
666
  inputs=[
667
  retriever_state,
 
669
  min_similarity,
670
  top_k,
671
  ],
672
+ outputs=[all_relevant_adus_df, all_relevant_adus_query_doc_id],
673
  )
674
 
675
+ all_relevant_adus_df.change(**render_event_kwargs)
676
+
677
  # select query span id from the "retrieve all" result data frames
678
  all_similar_adus_df.select(
679
  fn=get_cell_for_fixed_column_from_df,
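The event wiring above relies on two Gradio idioms: event kwargs are collected in plain dicts (`show_overview_kwargs`, `show_stats_kwargs`, `render_event_kwargs`) so they can be re-attached to several triggers, and follow-up steps are chained with `.then()` / `.success()`. A minimal, self-contained sketch of that pattern; all component and function names below are illustrative, not taken from the demo:

```python
import gradio as gr


def analyse(text: str) -> str:
    # stand-in for the actual prediction step
    return text.upper()


def compute_stats(result: str) -> str:
    # stand-in for open_accordion_with_stats
    return f"{len(result)} characters"


with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Input")
    predict_btn = gr.Button("Analyse")
    result_out = gr.Textbox(label="Result")
    stats_out = gr.Textbox(label="Stats")

    # reusable event kwargs, analogous to show_overview_kwargs / show_stats_kwargs
    show_stats_kwargs = dict(fn=compute_stats, inputs=[result_out], outputs=[stats_out])

    # run the prediction, then update the stats only if the prediction succeeded
    predict_btn.click(
        fn=analyse, inputs=[text_in], outputs=[result_out]
    ).success(**show_stats_kwargs)

if __name__ == "__main__":
    demo.launch()
```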
src/train.py CHANGED
@@ -220,6 +220,7 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
  train_metrics = trainer.callback_metrics

  best_ckpt_path = trainer.checkpoint_callback.best_model_path
+ best_epoch = None
  if best_ckpt_path != "":
  log.info(f"Best ckpt path: {best_ckpt_path}")
  best_checkpoint_file = os.path.basename(best_ckpt_path)
@@ -228,6 +229,14 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
  best_checkpoint=best_checkpoint_file,
  checkpoint_dir=trainer.checkpoint_callback.dirpath,
  )
+ # get epoch from best_checkpoint_file (e.g. "epoch_078.ckpt")
+ try:
+ best_epoch = int(os.path.splitext(best_checkpoint_file)[0].split("_")[-1])
+ except Exception as e:
+ log.warning(
+ f'Could not retrieve epoch from best checkpoint file name: "{e}". '
+ f"Expected format: " + '"epoch_{best_epoch}.ckpt"'
+ )

  if not cfg.trainer.get("fast_dev_run"):
  if cfg.model_save_dir is not None:
@@ -259,6 +268,7 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
  trainer.test(model=model, datamodule=datamodule, ckpt_path=best_ckpt_path or None)

  test_metrics = trainer.callback_metrics
+ test_metrics["best_epoch"] = best_epoch

  # merge train and test metrics
  metric_dict = {**train_metrics, **test_metrics}
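For reference, the filename parsing added above behaves as follows, assuming checkpoints follow the `epoch_XXX.ckpt` naming shown in the comment:

```python
import os

# e.g. "epoch_078.ckpt" -> 78; any other naming scheme ends up in the except branch above
best_checkpoint_file = "epoch_078.ckpt"
best_epoch = int(os.path.splitext(best_checkpoint_file)[0].split("_")[-1])
print(best_epoch)  # 78
```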
src/utils/__init__.py CHANGED
@@ -1,4 +1,9 @@
- from .config_utils import execute_pipeline, instantiate_dict_entries, prepare_omegaconf
+ from .config_utils import (
+ execute_pipeline,
+ instantiate_dict_entries,
+ parse_config,
+ prepare_omegaconf,
+ )
  from .data_utils import download_and_unzip, filter_dataframe_and_get_column
  from .logging_utils import close_loggers, get_pylogger, log_hyperparameters
  from .rich_utils import enforce_tags, print_config_tree
src/utils/config_utils.py CHANGED
@@ -1,5 +1,5 @@
  from copy import copy
- from typing import Any, List, Optional
+ from typing import Any, Dict, List, Optional

  from hydra.utils import instantiate
  from omegaconf import DictConfig, OmegaConf
@@ -69,3 +69,17 @@ def prepare_omegaconf():
  OmegaConf.register_new_resolver("replace", lambda s, x, y: s.replace(x, y))
  else:
  logger.warning("OmegaConf resolver 'replace' is already registered")
+
+
+ def parse_config(config_string: str, format: str) -> Dict[str, Any]:
+ """Parse a configuration string."""
+ if format == "json":
+ import json
+
+ return json.loads(config_string)
+ elif format == "yaml":
+ import yaml
+
+ return yaml.safe_load(config_string)
+ else:
+ raise ValueError(f"Unsupported format: {format}. Use 'json' or 'yaml'.")
src/utils/pdf_utils/README.MD ADDED
@@ -0,0 +1,35 @@
+ # Generate paper JSON files from a collection XML file, with fulltext extraction
+
+ This is a slightly re-arranged version of Sotaro Takeshita's code, which is available at https://github.com/gengo-proj/data-factory.
+
+ ## Requirements
+
+ - Docker
+ - Python>=3.10
+ - python packages:
+   - acl-anthology-py>=0.4.3
+   - bs4
+   - jsonschema
+
+ ## Setup
+
+ Start the Grobid Docker container:
+
+ ```bash
+ docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
+ ```
+
+ Get the metadata from the ACL Anthology:
+
+ ```bash
+ git clone git@github.com:acl-org/acl-anthology.git
+ ```
+
+ ## Usage
+
+ ```bash
+ python src/data/acl_anthology_crawler.py \
+   --base-output-dir <path/to/save/raw-paper.json> \
+   --pdf-output-dir <path/to/save/downloaded/paper.pdf> \
+   --anthology-data-dir ./acl-anthology/data/
+ ```
src/utils/pdf_utils/__init__.py ADDED
File without changes
src/utils/pdf_utils/acl_anthology_utils.py ADDED
@@ -0,0 +1,77 @@
1
+ from dataclasses import dataclass
2
+ from typing import Iterator
3
+
4
+ from acl_anthology import Anthology
5
+
6
+ from .process_pdf import paper_url_to_uuid
7
+ from .raw_paper import RawPaper
8
+
9
+
10
+ @dataclass
11
+ class XML2RawPapers:
12
+ anthology: Anthology
13
+ collection_id_filters: list[str] | None = None
14
+ venue_id_whitelist: list[str] | None = None
15
+ verbose: bool = True
16
+
17
+ def __call__(self, *args, **kwargs) -> Iterator[RawPaper]:
18
+
19
+ for collection_id, collection in self.anthology.collections.items():
20
+ if self.collection_id_filters is not None:
21
+ if not any(
22
+ [
23
+ collection_id.find(filter_str) != -1
24
+ for filter_str in self.collection_id_filters
25
+ ]
26
+ ):
27
+ continue
28
+ if self.verbose:
29
+ print(f"Processing collection: {collection_id}")
30
+ for volume in collection.volumes():
31
+ if self.venue_id_whitelist is not None:
32
+ if not any(
33
+ [venue_id in volume.venue_ids for venue_id in self.venue_id_whitelist]
34
+ ):
35
+ continue
36
+
37
+ volume_id = f"{collection_id}-{volume.id}"
38
+
39
+ for paper in volume.papers():
40
+ fulltext, abstract = None, None
41
+ if (
42
+ paper.pdf is not None
43
+ and paper.pdf.name is not None
44
+ and paper.pdf.name.find("http") == -1
45
+ ):
46
+ name = paper.pdf.name
47
+ else:
48
+ name = (
49
+ f"{volume_id}.{paper.id.rjust(3, '0')}"
50
+ if len(collection_id) == 1
51
+ else f"{volume_id}.{paper.id}"
52
+ )
53
+
54
+ paper_uuid = paper_url_to_uuid(name)
55
+ raw_paper = RawPaper(
56
+ paper_uuid=str(paper_uuid),
57
+ name=name,
58
+ collection_id=collection_id,
59
+ collection_acronym=volume.venues()[0].acronym,
60
+ volume_id=volume_id,
61
+ booktitle=volume.title.as_text(),
62
+ paper_id=int(paper.id),
63
+ year=int(paper.year),
64
+ paper_title=paper.title.as_text(),
65
+ authors=[
66
+ {"first": author.first, "last": author.last}
67
+ for author in paper.authors
68
+ ],
69
+ abstract=(
70
+ paper.abstract.as_text() if paper.abstract is not None else abstract
71
+ ),
72
+ url=paper.pdf.url if paper.pdf is not None else None,
73
+ bibkey=paper.bibkey if paper.bibkey is not None else None,
74
+ doi=paper.doi if paper.doi is not None else None,
75
+ fulltext=fulltext,
76
+ )
77
+ yield raw_paper
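`XML2RawPapers` walks the ACL Anthology collections and yields one `RawPaper` per paper. A hedged sketch of driving it, assuming a local clone of `acl-org/acl-anthology` (see the README above) and that `Anthology` accepts the data directory via `datadir`:

```python
from acl_anthology import Anthology

from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers

xml2raw = XML2RawPapers(
    anthology=Anthology(datadir="./acl-anthology/data"),  # assumed constructor argument
    venue_id_whitelist=["wiesp"],
    verbose=False,
)
for raw_paper in xml2raw():
    print(raw_paper.paper_uuid, raw_paper.paper_title)
    raw_paper.save("./papers")  # validates against the schema and writes <name>.json
```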
src/utils/pdf_utils/client.py ADDED
@@ -0,0 +1,193 @@
1
+ """ Generic API Client """
2
+
3
+ import json
4
+ from copy import deepcopy
5
+
6
+ import requests
7
+
8
+ try:
9
+ from urlparse import urljoin
10
+ except ImportError:
11
+ from urllib.parse import urljoin
12
+
13
+
14
+ class ApiClient(object):
15
+ """Client to interact with a generic Rest API.
16
+
17
+ Subclasses should implement functionality accordingly with the provided
18
+ service methods, i.e. ``get``, ``post``, ``put`` and ``delete``.
19
+ """
20
+
21
+ accept_type = "application/xml"
22
+ api_base = None
23
+
24
+ def __init__(self, base_url, username=None, api_key=None, status_endpoint=None, timeout=60):
25
+ """Initialise client.
26
+
27
+ Args:
28
+ base_url (str): The base URL to the service being used.
29
+ username (str): The username to authenticate with.
30
+ api_key (str): The API key to authenticate with.
31
+ timeout (int): Maximum time before timing out.
32
+ """
33
+ self.base_url = base_url
34
+ self.username = username
35
+ self.api_key = api_key
36
+ self.status_endpoint = urljoin(self.base_url, status_endpoint)
37
+ self.timeout = timeout
38
+
39
+ @staticmethod
40
+ def encode(request, data):
41
+ """Add request content data to request body, set Content-type header.
42
+
43
+ Should be overridden by subclasses if not using JSON encoding.
44
+
45
+ Args:
46
+ request (HTTPRequest): The request object.
47
+ data (dict, None): Data to be encoded.
48
+
49
+ Returns:
50
+ HTTPRequest: The request object.
51
+ """
52
+ if data is None:
53
+ return request
54
+
55
+ request.add_header("Content-Type", "application/json")
56
+ request.data = json.dumps(data)
57
+
58
+ return request
59
+
60
+ @staticmethod
61
+ def decode(response):
62
+ """Decode the returned data in the response.
63
+
64
+ Should be overridden by subclasses if something else than JSON is
65
+ expected.
66
+
67
+ Args:
68
+ response (HTTPResponse): The response object.
69
+
70
+ Returns:
71
+ dict or None.
72
+ """
73
+ try:
74
+ return response.json()
75
+ except ValueError as e:
76
+ return str(e)
77
+
78
+ def get_credentials(self):
79
+ """Returns parameters to be added to authenticate the request.
80
+
81
+ This lives on its own to make it easier to re-implement it if needed.
82
+
83
+ Returns:
84
+ dict: A dictionary containing the credentials.
85
+ """
86
+ return {"username": self.username, "api_key": self.api_key}
87
+
88
+ def call_api(
89
+ self,
90
+ method,
91
+ url,
92
+ headers=None,
93
+ params=None,
94
+ data=None,
95
+ files=None,
96
+ timeout=None,
97
+ ):
98
+ """Call API.
99
+
100
+ This returns object containing data, with error details if applicable.
101
+
102
+ Args:
103
+ method (str): The HTTP method to use.
104
+ url (str): Resource location relative to the base URL.
105
+ headers (dict or None): Extra request headers to set.
106
+ params (dict or None): Query-string parameters.
107
+ data (dict or None): Request body contents for POST or PUT requests.
108
+ files (dict or None): Files to be passed to the request.
109
+ timeout (int): Maximum time before timing out.
110
+
111
+ Returns:
112
+ ResultParser or ErrorParser.
113
+ """
114
+ headers = deepcopy(headers) or {}
115
+ headers["Accept"] = self.accept_type
116
+ params = deepcopy(params) or {}
117
+ data = data or {}
118
+ files = files or {}
119
+ # if self.username is not None and self.api_key is not None:
120
+ # params.update(self.get_credentials())
121
+ r = requests.request(
122
+ method,
123
+ url,
124
+ headers=headers,
125
+ params=params,
126
+ files=files,
127
+ data=data,
128
+ timeout=timeout,
129
+ )
130
+
131
+ return r, r.status_code
132
+
133
+ def get(self, url, params=None, **kwargs):
134
+ """Call the API with a GET request.
135
+
136
+ Args:
137
+ url (str): Resource location relative to the base URL.
138
+ params (dict or None): Query-string parameters.
139
+
140
+ Returns:
141
+ ResultParser or ErrorParser.
142
+ """
143
+ return self.call_api("GET", url, params=params, **kwargs)
144
+
145
+ def delete(self, url, params=None, **kwargs):
146
+ """Call the API with a DELETE request.
147
+
148
+ Args:
149
+ url (str): Resource location relative to the base URL.
150
+ params (dict or None): Query-string parameters.
151
+
152
+ Returns:
153
+ ResultParser or ErrorParser.
154
+ """
155
+ return self.call_api("DELETE", url, params=params, **kwargs)
156
+
157
+ def put(self, url, params=None, data=None, files=None, **kwargs):
158
+ """Call the API with a PUT request.
159
+
160
+ Args:
161
+ url (str): Resource location relative to the base URL.
162
+ params (dict or None): Query-string parameters.
163
+ data (dict or None): Request body contents.
164
+ files (dict or None): Files to be passed to the request.
165
+
166
+ Returns:
167
+ An instance of ResultParser or ErrorParser.
168
+ """
169
+ return self.call_api("PUT", url, params=params, data=data, files=files, **kwargs)
170
+
171
+ def post(self, url, params=None, data=None, files=None, **kwargs):
172
+ """Call the API with a POST request.
173
+
174
+ Args:
175
+ url (str): Resource location relative to the base URL.
176
+ params (dict or None): Query-string parameters.
177
+ data (dict or None): Request body contents.
178
+ files (dict or None): Files to be passed to the request.
179
+
180
+ Returns:
181
+ An instance of ResultParser or ErrorParser.
182
+ """
183
+ return self.call_api(
184
+ method="POST", url=url, params=params, data=data, files=files, **kwargs
185
+ )
186
+
187
+ def service_status(self, **kwargs):
188
+ """Call the API to get the status of the service.
189
+
190
+ Returns:
191
+ An instance of ResultParser or ErrorParser.
192
+ """
193
+ return self.call_api("GET", self.status_endpoint, params={"format": "json"}, **kwargs)
src/utils/pdf_utils/grobid_client.py ADDED
@@ -0,0 +1,203 @@
1
+ import glob
2
+ import io
3
+ import ntpath
4
+ import os
5
+ import time
6
+ from typing import List, Optional
7
+
8
+ from .client import ApiClient
9
+
10
+ # This version uses the standard ProcessPoolExecutor for parallelizing the concurrent calls to the GROBID services.
11
+ # Given the limits of ThreadPoolExecutor (input stored in memory, blocking Executor.map until the whole input
12
+ # is acquired), it works with batches of PDF of a size indicated in the config.json file (default is 1000 entries).
13
+ # We are moving from first batch to the second one only when the first is entirely processed - which means it is
14
+ # slightly sub-optimal, but should scale better. However acquiring a list of million of files in directories would
15
+ # require something scalable too, which is not implemented for the moment.
16
+ DEFAULT_GROBID_CONFIG = {
17
+ "grobid_server": "localhost",
18
+ "grobid_port": "8070",
19
+ "batch_size": 1000,
20
+ "sleep_time": 5,
21
+ "generateIDs": False,
22
+ "consolidate_header": False,
23
+ "consolidate_citations": False,
24
+ "include_raw_citations": True,
25
+ "include_raw_affiliations": False,
26
+ "max_workers": 2,
27
+ }
28
+
29
+
30
+ class GrobidClient(ApiClient):
31
+ def __init__(self, config=None):
32
+ self.config = config or DEFAULT_GROBID_CONFIG
33
+ self.generate_ids = self.config["generateIDs"]
34
+ self.consolidate_header = self.config["consolidate_header"]
35
+ self.consolidate_citations = self.config["consolidate_citations"]
36
+ self.include_raw_citations = self.config["include_raw_citations"]
37
+ self.include_raw_affiliations = self.config["include_raw_affiliations"]
38
+ self.max_workers = self.config["max_workers"]
39
+ self.grobid_server = self.config["grobid_server"]
40
+ self.grobid_port = str(self.config["grobid_port"])
41
+ self.sleep_time = self.config["sleep_time"]
42
+
43
+ def process(self, input: str, output: str, service: str):
44
+ batch_size_pdf = self.config["batch_size"]
45
+ pdf_files = []
46
+
47
+ for pdf_file in glob.glob(input + "/*.pdf"):
48
+ pdf_files.append(pdf_file)
49
+
50
+ if len(pdf_files) == batch_size_pdf:
51
+ self.process_batch(pdf_files, output, service)
52
+ pdf_files = []
53
+
54
+ # last batch
55
+ if len(pdf_files) > 0:
56
+ self.process_batch(pdf_files, output, service)
57
+
58
+ def process_batch(self, pdf_files: List[str], output: str, service: str) -> None:
59
+ print(len(pdf_files), "PDF files to process")
60
+ for pdf_file in pdf_files:
61
+ self.process_pdf(pdf_file, output, service)
62
+
63
+ def process_pdf_stream(self, pdf_file: str, pdf_strm: bytes, output: str, service: str) -> str:
64
+ # process the stream
65
+ files = {"input": (pdf_file, pdf_strm, "application/pdf", {"Expires": "0"})}
66
+
67
+ the_url = "http://" + self.grobid_server
68
+ the_url += ":" + self.grobid_port
69
+ the_url += "/api/" + service
70
+
71
+ # set the GROBID parameters
72
+ the_data = {}
73
+ if self.generate_ids:
74
+ the_data["generateIDs"] = "1"
75
+ else:
76
+ the_data["generateIDs"] = "0"
77
+
78
+ if self.consolidate_header:
79
+ the_data["consolidateHeader"] = "1"
80
+ else:
81
+ the_data["consolidateHeader"] = "0"
82
+
83
+ if self.consolidate_citations:
84
+ the_data["consolidateCitations"] = "1"
85
+ else:
86
+ the_data["consolidateCitations"] = "0"
87
+
88
+ if self.include_raw_affiliations:
89
+ the_data["includeRawAffiliations"] = "1"
90
+ else:
91
+ the_data["includeRawAffiliations"] = "0"
92
+
93
+ if self.include_raw_citations:
94
+ the_data["includeRawCitations"] = "1"
95
+ else:
96
+ the_data["includeRawCitations"] = "0"
97
+
98
+ res, status = self.post(
99
+ url=the_url, files=files, data=the_data, headers={"Accept": "text/plain"}
100
+ )
101
+
102
+ if status == 503:
103
+ time.sleep(self.sleep_time)
104
+ # TODO: check if simply passing output as output is correct
105
+ return self.process_pdf_stream(
106
+ pdf_file=pdf_file, pdf_strm=pdf_strm, service=service, output=output
107
+ )
108
+ elif status != 200:
109
+ with open(os.path.join(output, "failed.log"), "a+") as failed:
110
+ failed.write(pdf_file.strip(".pdf") + "\n")
111
+ print("Processing failed with error " + str(status))
112
+ return ""
113
+ else:
114
+ return res.text
115
+
116
+ def process_pdf(self, pdf_file: str, output: str, service: str) -> None:
117
+ # check if TEI file is already produced
118
+ # we use ntpath here to be sure it will work on Windows too
119
+ pdf_file_name = ntpath.basename(pdf_file)
120
+ filename = os.path.join(output, os.path.splitext(pdf_file_name)[0] + ".tei.xml")
121
+ if os.path.isfile(filename):
122
+ return
123
+
124
+ print(pdf_file)
125
+ pdf_strm = open(pdf_file, "rb").read()
126
+ tei_text = self.process_pdf_stream(pdf_file, pdf_strm, output, service)
127
+
128
+ # writing TEI file
129
+ if tei_text:
130
+ with io.open(filename, "w+", encoding="utf8") as tei_file:
131
+ tei_file.write(tei_text)
132
+
133
+ def process_citation(self, bib_string: str, log_file: str) -> Optional[str]:
134
+ # process citation raw string and return corresponding dict
135
+ the_data = {"citations": bib_string, "consolidateCitations": "0"}
136
+
137
+ the_url = "http://" + self.grobid_server
138
+ the_url += ":" + self.grobid_port
139
+ the_url += "/api/processCitation"
140
+
141
+ for _ in range(5):
142
+ try:
143
+ res, status = self.post(
144
+ url=the_url, data=the_data, headers={"Accept": "text/plain"}
145
+ )
146
+ if status == 503:
147
+ time.sleep(self.sleep_time)
148
+ continue
149
+ elif status != 200:
150
+ with open(log_file, "a+") as failed:
151
+ failed.write("-- BIBSTR --\n")
152
+ failed.write(bib_string + "\n\n")
153
+ break
154
+ else:
155
+ return res.text
156
+ except Exception:
157
+ continue
158
+
159
+ return None
160
+
161
+ def process_header_names(self, header_string: str, log_file: str) -> Optional[str]:
162
+ # process author names from header string
163
+ the_data = {"names": header_string}
164
+
165
+ the_url = "http://" + self.grobid_server
166
+ the_url += ":" + self.grobid_port
167
+ the_url += "/api/processHeaderNames"
168
+
169
+ res, status = self.post(url=the_url, data=the_data, headers={"Accept": "text/plain"})
170
+
171
+ if status == 503:
172
+ time.sleep(self.sleep_time)
173
+ return self.process_header_names(header_string, log_file)
174
+ elif status != 200:
175
+ with open(log_file, "a+") as failed:
176
+ failed.write("-- AUTHOR --\n")
177
+ failed.write(header_string + "\n\n")
178
+ else:
179
+ return res.text
180
+
181
+ return None
182
+
183
+ def process_affiliations(self, aff_string: str, log_file: str) -> Optional[str]:
184
+ # process affiliation from input string
185
+ the_data = {"affiliations": aff_string}
186
+
187
+ the_url = "http://" + self.grobid_server
188
+ the_url += ":" + self.grobid_port
189
+ the_url += "/api/processAffiliations"
190
+
191
+ res, status = self.post(url=the_url, data=the_data, headers={"Accept": "text/plain"})
192
+
193
+ if status == 503:
194
+ time.sleep(self.sleep_time)
195
+ return self.process_affiliations(aff_string, log_file)
196
+ elif status != 200:
197
+ with open(log_file, "a+") as failed:
198
+ failed.write("-- AFFILIATION --\n")
199
+ failed.write(aff_string + "\n\n")
200
+ else:
201
+ return res.text
202
+
203
+ return None
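A hedged sketch of batch-converting a folder of PDFs with this client against a locally running Grobid server (directory names are placeholders):

```python
import os

from src.utils.pdf_utils.grobid_client import DEFAULT_GROBID_CONFIG, GrobidClient

os.makedirs("./tei", exist_ok=True)  # the client writes into an existing output directory
config = {**DEFAULT_GROBID_CONFIG, "grobid_server": "localhost", "grobid_port": "8070"}
client = GrobidClient(config)

# writes one <name>.tei.xml per PDF into ./tei; failures are appended to ./tei/failed.log
client.process("./pdfs", "./tei", "processFulltextDocument")
```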
src/utils/pdf_utils/grobid_util.py ADDED
@@ -0,0 +1,413 @@
1
+ import re
2
+ from collections import defaultdict
3
+ from typing import Dict, List, Optional, Union
4
+
5
+ import bs4
6
+ from bs4 import BeautifulSoup
7
+
8
+ SUBSTITUTE_TAGS = {"persName", "orgName", "publicationStmt", "titleStmt", "biblScope"}
9
+
10
+
11
+ def clean_tags(el: bs4.element.Tag):
12
+ """
13
+ Replace all tags with lowercase version
14
+ :param el:
15
+ :return:
16
+ """
17
+ for sub_tag in SUBSTITUTE_TAGS:
18
+ for sub_el in el.find_all(sub_tag):
19
+ sub_el.name = sub_tag.lower()
20
+
21
+
22
+ def soup_from_path(file_path: str):
23
+ """
24
+ Read XML file
25
+ :param file_path:
26
+ :return:
27
+ """
28
+ return BeautifulSoup(open(file_path, "rb").read(), "xml")
29
+
30
+
31
+ def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
32
+ """
33
+ Returns title
34
+ :return:
35
+ """
36
+ for title_entry in raw_xml.find_all("title"):
37
+ if title_entry.has_attr("level") and title_entry["level"] == "a":
38
+ return title_entry.text
39
+ try:
40
+ return raw_xml.title.text
41
+ except AttributeError:
42
+ return ""
43
+
44
+
45
+ def get_author_names_from_grobid_xml(
46
+ raw_xml: BeautifulSoup,
47
+ ) -> List[Dict[str, Union[str, List[str]]]]:
48
+ """
49
+ Returns a list of dictionaries, one for each author,
50
+ containing the first and last names.
51
+
52
+ e.g.
53
+ {
54
+ "first": first,
55
+ "middle": middle,
56
+ "last": last,
57
+ "suffix": suffix
58
+ }
59
+ """
60
+ names = []
61
+
62
+ for author in raw_xml.find_all("author"):
63
+ if not author.persname:
64
+ continue
65
+
66
+ # forenames include first and middle names
67
+ forenames = author.persname.find_all("forename")
68
+
69
+ # surnames include last names
70
+ surnames = author.persname.find_all("surname")
71
+
72
+ # name suffixes
73
+ suffixes = author.persname.find_all("suffix")
74
+
75
+ first = ""
76
+ middle = []
77
+ last = ""
78
+ suffix = ""
79
+
80
+ for forename in forenames:
81
+ if forename["type"] == "first":
82
+ if not first:
83
+ first = forename.text
84
+ else:
85
+ middle.append(forename.text)
86
+ elif forename["type"] == "middle":
87
+ middle.append(forename.text)
88
+
89
+ if len(surnames) > 1:
90
+ for surname in surnames[:-1]:
91
+ middle.append(surname.text)
92
+ last = surnames[-1].text
93
+ elif len(surnames) == 1:
94
+ last = surnames[0].text
95
+
96
+ if len(suffixes) >= 1:
97
+ suffix = " ".join([suff.text for suff in suffixes])
98
+
99
+ names_dict: Dict[str, Union[str, List[str]]] = {
100
+ "first": first,
101
+ "middle": middle,
102
+ "last": last,
103
+ "suffix": suffix,
104
+ }
105
+
106
+ names.append(names_dict)
107
+ return names
108
+
109
+
110
+ def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict:
111
+ """
112
+ Get affiliation from grobid xml
113
+ :param raw_xml:
114
+ :return:
115
+ """
116
+ location_dict = dict()
117
+ laboratory_name = ""
118
+ institution_name = ""
119
+
120
+ if raw_xml and raw_xml.affiliation:
121
+ for child in raw_xml.affiliation:
122
+ if child.name == "orgname":
123
+ if child.has_attr("type"):
124
+ if child["type"] == "laboratory":
125
+ laboratory_name = child.text
126
+ elif child["type"] == "institution":
127
+ institution_name = child.text
128
+ elif child.name == "address":
129
+ for grandchild in child:
130
+ if grandchild.name and grandchild.text:
131
+ location_dict[grandchild.name] = grandchild.text
132
+
133
+ if laboratory_name or institution_name:
134
+ return {
135
+ "laboratory": laboratory_name,
136
+ "institution": institution_name,
137
+ "location": location_dict,
138
+ }
139
+
140
+ return {}
141
+
142
+
143
+ def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]:
144
+ """
145
+ Returns a list of dictionaries, one for each author,
146
+ containing the first and last names.
147
+
148
+ e.g.
149
+ {
150
+ "first": first,
151
+ "middle": middle,
152
+ "last": last,
153
+ "suffix": suffix,
154
+ "affiliation": {
155
+ "laboratory": "",
156
+ "institution": "",
157
+ "location": "",
158
+ },
159
+ "email": ""
160
+ }
161
+ """
162
+ authors = []
163
+
164
+ for author in raw_xml.find_all("author"):
165
+
166
+ first = ""
167
+ middle = []
168
+ last = ""
169
+ suffix = ""
170
+
171
+ if author.persname:
172
+ # forenames include first and middle names
173
+ forenames = author.persname.find_all("forename")
174
+
175
+ # surnames include last names
176
+ surnames = author.persname.find_all("surname")
177
+
178
+ # name suffixes
179
+ suffixes = author.persname.find_all("suffix")
180
+
181
+ for forename in forenames:
182
+ if forename.has_attr("type"):
183
+ if forename["type"] == "first":
184
+ if not first:
185
+ first = forename.text
186
+ else:
187
+ middle.append(forename.text)
188
+ elif forename["type"] == "middle":
189
+ middle.append(forename.text)
190
+
191
+ if len(surnames) > 1:
192
+ for surname in surnames[:-1]:
193
+ middle.append(surname.text)
194
+ last = surnames[-1].text
195
+ elif len(surnames) == 1:
196
+ last = surnames[0].text
197
+
198
+ if len(suffixes) >= 1:
199
+ suffix = " ".join([suffix.text for suffix in suffixes])
200
+
201
+ affiliation = get_affiliation_from_grobid_xml(author)
202
+
203
+ email = ""
204
+ if author.email:
205
+ email = author.email.text
206
+
207
+ author_dict = {
208
+ "first": first,
209
+ "middle": middle,
210
+ "last": last,
211
+ "suffix": suffix,
212
+ "affiliation": affiliation,
213
+ "email": email,
214
+ }
215
+
216
+ authors.append(author_dict)
217
+
218
+ return authors
219
+
220
+
221
+ def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]:
222
+ """
223
+ Returns date published if exists
224
+ :return:
225
+ """
226
+ if raw_xml.date and raw_xml.date.has_attr("when"):
227
+ # match year in date text (which is in some unspecified date format)
228
+ year_match = re.match(r"((19|20)\d{2})", raw_xml.date["when"])
229
+ if year_match:
230
+ year = year_match.group(0)
231
+ if year and year.isnumeric() and len(year) == 4:
232
+ return int(year)
233
+ return None
234
+
235
+
236
+ def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str:
237
+ """
238
+ Returns venue/journal/publisher of bib entry
239
+ Grobid ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
240
+ level="j": journal title
241
+ level="m": "non journal bibliographical item holding the cited article"
242
+ level="s": series title
243
+ :return:
244
+ """
245
+ title_names = []
246
+ keep_types = ["j", "m", "s"]
247
+ # get all titles of the above types
248
+ for title_entry in raw_xml.find_all("title"):
249
+ if (
250
+ title_entry.has_attr("level")
251
+ and title_entry["level"] in keep_types
252
+ and title_entry.text != title_text
253
+ ):
254
+ title_names.append((title_entry["level"], title_entry.text))
255
+ # return the title name that most likely belongs to the journal or publication venue
256
+ if title_names:
257
+ title_names.sort(key=lambda x: keep_types.index(x[0]))
258
+ return title_names[0][1]
259
+ return ""
260
+
261
+
262
+ def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
263
+ """
264
+ Returns the volume number of grobid bib entry
265
+ Grobid <biblscope unit="volume">
266
+ :return:
267
+ """
268
+ for bibl_entry in raw_xml.find_all("biblscope"):
269
+ if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "volume":
270
+ return bibl_entry.text
271
+ return ""
272
+
273
+
274
+ def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
275
+ """
276
+ Returns the issue number of grobid bib entry
277
+ Grobid <biblscope unit="issue">
278
+ :return:
279
+ """
280
+ for bibl_entry in raw_xml.find_all("biblscope"):
281
+ if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "issue":
282
+ return bibl_entry.text
283
+ return ""
284
+
285
+
286
+ def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
287
+ """
288
+ Returns the page numbers of grobid bib entry
289
+ Grobid <biblscope unit="page">
290
+ :return:
291
+ """
292
+ for bibl_entry in raw_xml.find_all("biblscope"):
293
+ if (
294
+ bibl_entry.has_attr("unit")
295
+ and bibl_entry["unit"] == "page"
296
+ and bibl_entry.has_attr("from")
297
+ ):
298
+ from_page = bibl_entry["from"]
299
+ if bibl_entry.has_attr("to"):
300
+ to_page = bibl_entry["to"]
301
+ return f"{from_page}--{to_page}"
302
+ else:
303
+ return from_page
304
+ return ""
305
+
306
+
307
+ def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]:
308
+ """
309
+ Returns a dictionary of other identifiers from grobid bib entry (arxiv, pubmed, doi)
310
+ :param raw_xml:
311
+ :return:
312
+ """
313
+ other_ids = defaultdict(list)
314
+
315
+ for idno_entry in raw_xml.find_all("idno"):
316
+ if idno_entry.has_attr("type") and idno_entry.text:
317
+ other_ids[idno_entry["type"]].append(idno_entry.text)
318
+
319
+ return other_ids
320
+
321
+
322
+ def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
323
+ """
324
+ Returns the raw bibliography string
325
+ :param raw_xml:
326
+ :return:
327
+ """
328
+ for note in raw_xml.find_all("note"):
329
+ if note.has_attr("type") and note["type"] == "raw_reference":
330
+ return note.text
331
+ return ""
332
+
333
+
334
+ def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
335
+ """
336
+ Finds and returns the publication datetime if it exists
337
+ :param raw_xml:
338
+ :return:
339
+ """
340
+ if raw_xml.publicationstmt:
341
+ for child in raw_xml.publicationstmt:
342
+ if (
343
+ child.name == "date"
344
+ and child.has_attr("type")
345
+ and child["type"] == "published"
346
+ and child.has_attr("when")
347
+ ):
348
+ return child["when"]
349
+ return ""
350
+
351
+
352
+ def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict:
353
+ """
354
+ Parse one bib entry
355
+ :param bib_entry:
356
+ :return:
357
+ """
358
+ clean_tags(bib_entry)
359
+ title = get_title_from_grobid_xml(bib_entry)
360
+ return {
361
+ "ref_id": bib_entry.attrs.get("xml:id", None),
362
+ "title": title,
363
+ "authors": get_author_names_from_grobid_xml(bib_entry),
364
+ "year": get_year_from_grobid_xml(bib_entry),
365
+ "venue": get_venue_from_grobid_xml(bib_entry, title),
366
+ "volume": get_volume_from_grobid_xml(bib_entry),
367
+ "issue": get_issue_from_grobid_xml(bib_entry),
368
+ "pages": get_pages_from_grobid_xml(bib_entry),
369
+ "other_ids": get_other_ids_from_grobid_xml(bib_entry),
370
+ "raw_text": get_raw_bib_text_from_grobid_xml(bib_entry),
371
+ "urls": [],
372
+ }
373
+
374
+
375
+ def is_reference_tag(tag: bs4.element.Tag) -> bool:
376
+ return tag.name == "ref" and tag.attrs.get("type", "") == "bibr"
377
+
378
+
379
+ def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict:
380
+ """
381
+ Extract paper metadata (title, authors, affiliation, year) from grobid xml
382
+ :param tag:
383
+ :return:
384
+ """
385
+ clean_tags(tag)
386
+ paper_metadata = {
387
+ "title": tag.titlestmt.title.text,
388
+ "authors": get_author_data_from_grobid_xml(tag),
389
+ "year": get_publication_datetime_from_grobid_xml(tag),
390
+ }
391
+ return paper_metadata
392
+
393
+
394
+ def parse_bibliography(soup: BeautifulSoup) -> List[Dict]:
395
+ """
396
+ Finds all bibliography entries in a grobid xml.
397
+ """
398
+ bibliography = soup.listBibl
399
+ if bibliography is None:
400
+ return []
401
+
402
+ entries = bibliography.find_all("biblStruct")
403
+
404
+ structured_entries = []
405
+ for entry in entries:
406
+ bib_entry = parse_bib_entry(entry)
407
+ # add bib entry only if it has a title
408
+ if bib_entry["title"]:
409
+ structured_entries.append(bib_entry)
410
+
411
+ bibliography.decompose()
412
+
413
+ return structured_entries
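For example, the bibliography of a TEI file produced by Grobid can be inspected like this (the path is a placeholder):

```python
from src.utils.pdf_utils.grobid_util import parse_bibliography, soup_from_path

soup = soup_from_path("./tei/example.tei.xml")
for bib_entry in parse_bibliography(soup):
    print(bib_entry["title"], bib_entry["year"], bib_entry["venue"], bib_entry["pages"])
```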
src/utils/pdf_utils/process_pdf.py ADDED
@@ -0,0 +1,276 @@
1
+ import json
2
+ import os
3
+ import uuid
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Dict, Optional
7
+
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+
11
+ from .grobid_client import GrobidClient
12
+ from .grobid_util import extract_paper_metadata_from_grobid_xml, parse_bibliography
13
+ from .s2orc_paper import Paper
14
+ from .utils import (
15
+ _clean_empty_and_duplicate_authors_from_grobid_parse,
16
+ check_if_citations_are_bracket_style,
17
+ extract_abstract_from_tei_xml,
18
+ extract_back_matter_from_tei_xml,
19
+ extract_body_text_from_tei_xml,
20
+ extract_figures_and_tables_from_tei_xml,
21
+ normalize_grobid_id,
22
+ sub_all_note_tags,
23
+ )
24
+
25
+ BASE_TEMP_DIR = "./grobid/temp"
26
+ BASE_OUTPUT_DIR = "./grobid/output"
27
+ BASE_LOG_DIR = "./grobid/log"
28
+
29
+
30
+ def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
31
+ """
32
+ Convert Grobid TEI XML to S2ORC json format
33
+ :param soup: BeautifulSoup of XML file content
34
+ :param paper_id: name of file
35
+ :param pdf_hash: hash of PDF
36
+ :return:
37
+ """
38
+ # extract metadata
39
+ metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
40
+ # clean metadata authors (remove dupes etc)
41
+ metadata["authors"] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata["authors"])
42
+
43
+ # parse bibliography entries (removes empty bib entries)
44
+ biblio_entries = parse_bibliography(soup)
45
+ bibkey_map = {normalize_grobid_id(bib["ref_id"]): bib for bib in biblio_entries}
46
+
47
+ # # process formulas and replace with text
48
+ # extract_formulas_from_tei_xml(soup)
49
+
50
+ # extract figure and table captions
51
+ refkey_map = extract_figures_and_tables_from_tei_xml(soup)
52
+
53
+ # get bracket style
54
+ is_bracket_style = check_if_citations_are_bracket_style(soup)
55
+
56
+ # substitute all note tags with p tags
57
+ soup = sub_all_note_tags(soup)
58
+
59
+ # process abstract if possible
60
+ abstract_entries = extract_abstract_from_tei_xml(
61
+ soup, bibkey_map, refkey_map, is_bracket_style
62
+ )
63
+
64
+ # process body text
65
+ body_entries = extract_body_text_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
66
+
67
+ # parse back matter (acks, author statements, competing interests, abbrevs etc)
68
+ back_matter = extract_back_matter_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
69
+
70
+ # form final paper entry
71
+ return Paper(
72
+ paper_id=paper_id,
73
+ pdf_hash=pdf_hash,
74
+ metadata=metadata,
75
+ abstract=abstract_entries,
76
+ body_text=body_entries,
77
+ back_matter=back_matter,
78
+ bib_entries=bibkey_map,
79
+ ref_entries=refkey_map,
80
+ )
81
+
82
+
83
+ def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
84
+ """
85
+ Convert a TEI XML file to S2ORC JSON
86
+ :param tei_file:
87
+ :param pdf_hash:
88
+ :return:
89
+ """
90
+ if not os.path.exists(tei_file):
91
+ raise FileNotFoundError("Input TEI XML file doesn't exist")
92
+ paper_id = tei_file.split("/")[-1].split(".")[0]
93
+ soup = BeautifulSoup(open(tei_file, "rb").read(), "xml")
94
+ paper = convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)
95
+ return paper
96
+
97
+
98
+ def process_pdf_stream(
99
+ input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None
100
+ ) -> Dict:
101
+ """
102
+ Process PDF stream
103
+ :param input_file:
104
+ :param sha:
105
+ :param input_stream:
106
+ :return:
107
+ """
108
+ # process PDF through Grobid -> TEI.XML
109
+ client = GrobidClient(grobid_config)
110
+ tei_text = client.process_pdf_stream(
111
+ input_file, input_stream, "temp", "processFulltextDocument"
112
+ )
113
+
114
+ # make soup
115
+ soup = BeautifulSoup(tei_text, "xml")
116
+
117
+ # get paper
118
+ paper = convert_tei_xml_soup_to_s2orc_json(soup, input_file, sha)
119
+
120
+ return paper.release_json("pdf")
121
+
122
+
123
+ def process_pdf_file(
124
+ input_file: str,
125
+ temp_dir: str = BASE_TEMP_DIR,
126
+ output_dir: str = BASE_OUTPUT_DIR,
127
+ grobid_config: Optional[Dict] = None,
128
+ verbose: bool = True,
129
+ ) -> str:
130
+ """
131
+ Process a PDF file and get JSON representation
132
+ :param input_file:
133
+ :param temp_dir:
134
+ :param output_dir:
135
+ :return:
136
+ """
137
+ os.makedirs(temp_dir, exist_ok=True)
138
+ os.makedirs(output_dir, exist_ok=True)
139
+
140
+ # get paper id as the name of the file
141
+ paper_id = ".".join(input_file.split("/")[-1].split(".")[:-1])
142
+ tei_file = os.path.join(temp_dir, f"{paper_id}.tei.xml")
143
+ output_file = os.path.join(output_dir, f"{paper_id}.json")
144
+
145
+ # check if input file exists and output file doesn't
146
+ if not os.path.exists(input_file):
147
+ raise FileNotFoundError(f"{input_file} doesn't exist")
148
+ if os.path.exists(output_file):
149
+ if verbose:
150
+ print(f"{output_file} already exists!")
151
+ return output_file
152
+
153
+ # process PDF through Grobid -> TEI.XML
154
+ client = GrobidClient(grobid_config)
155
+ # TODO: compute PDF hash
156
+ # TODO: add grobid version number to output
157
+ client.process_pdf(input_file, temp_dir, "processFulltextDocument")
158
+
159
+ # process TEI.XML -> JSON
160
+ assert os.path.exists(tei_file)
161
+ paper = convert_tei_xml_file_to_s2orc_json(tei_file)
162
+
163
+ # write to file
164
+ with open(output_file, "w") as outf:
165
+ json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
166
+
167
+ return output_file
168
+
169
+
170
+ UUID_NAMESPACE = uuid.UUID("bab08d37-ac12-40c4-847a-20ca337742fd")
171
+
172
+
173
+ def paper_url_to_uuid(paper_url: str) -> "uuid.UUID":
174
+ return uuid.uuid5(UUID_NAMESPACE, paper_url)
175
+
176
+
177
+ @dataclass
178
+ class PDFDownloader:
179
+ verbose: bool = True
180
+
181
+ def download(self, url: str, opath: str | Path) -> Path:
182
+ """Download a pdf file from URL and save locally.
183
+ Skip if there is a file at `opath` already.
184
+
185
+ Parameters
186
+ ----------
187
+ url : str
188
+ URL of the target PDF file
189
+ opath : str
190
+ Path to save downloaded PDF data.
191
+ """
192
+ if os.path.exists(opath):
193
+ return Path(opath)
194
+
195
+ if not os.path.exists(os.path.dirname(opath)):
196
+ os.makedirs(os.path.dirname(opath), exist_ok=True)
197
+
198
+ if self.verbose:
199
+ print(f"Downloading {url} into {opath}")
200
+ with open(opath, "wb") as f:
201
+ res = requests.get(url)
202
+ f.write(res.content)
203
+
204
+ return Path(opath)
205
+
206
+
207
+ @dataclass
208
+ class FulltextExtractor:
209
+
210
+ def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
211
+ """Extract plain text from a PDf file"""
212
+ raise NotImplementedError
213
+
214
+
215
+ @dataclass
216
+ class GrobidFulltextExtractor(FulltextExtractor):
217
+ tmp_dir: str = "./tmp/grobid"
218
+ grobid_config: Optional[Dict] = None
219
+ section_seperator: str = "\n\n"
220
+ paragraph_seperator: str = "\n"
221
+ verbose: bool = True
222
+
223
+ def construct_plain_text(self, extraction_result: dict) -> str:
224
+
225
+ section_strings = []
226
+
227
+ # add the title, if available (consider it as the first section)
228
+ title = extraction_result.get("title")
229
+ if title and title.strip():
230
+ section_strings.append(title.strip())
231
+
232
+ section_paragraphs: dict[str, list[str]] = extraction_result["sections"]
233
+ section_strings.extend(
234
+ self.paragraph_seperator.join(
235
+ # consider the section title as the first paragraph and
236
+ # remove empty paragraphs
237
+ filter(lambda s: len(s) > 0, map(lambda s: s.strip(), [section_name] + paragraphs))
238
+ )
239
+ for section_name, paragraphs in section_paragraphs.items()
240
+ )
241
+
242
+ return self.section_seperator.join(section_strings)
243
+
244
+ def postprocess_extraction_result(self, extraction_result: dict) -> dict:
245
+
246
+ # add sections
247
+ sections: dict[str, list[str]] = {}
248
+ for body_text in extraction_result["pdf_parse"]["body_text"]:
249
+ section_name = body_text["section"]
250
+
251
+ if section_name not in sections.keys():
252
+ sections[section_name] = []
253
+ sections[section_name] += [body_text["text"]]
254
+ extraction_result = {**extraction_result, "sections": sections}
255
+
256
+ return extraction_result
257
+
258
+ def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
259
+ """Extract plain text from a PDf file"""
260
+ try:
261
+ extraction_fpath = process_pdf_file(
262
+ str(pdf_file_path),
263
+ temp_dir=self.tmp_dir,
264
+ output_dir=self.tmp_dir,
265
+ grobid_config=self.grobid_config,
266
+ verbose=self.verbose,
267
+ )
268
+ with open(extraction_fpath, "r") as f:
269
+ extraction_result = json.load(f)
270
+
271
+ processed_extraction_result = self.postprocess_extraction_result(extraction_result)
272
+ plain_text = self.construct_plain_text(processed_extraction_result)
273
+ return plain_text, extraction_result
274
+ except AssertionError:
275
+ print("Grobid failed to parse this document.")
276
+ return None
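Putting the helpers in this module together, a single paper could be fetched and converted to plain text roughly as follows; the URL is a placeholder and a Grobid server must be reachable under the default config:

```python
from src.utils.pdf_utils.process_pdf import (
    GrobidFulltextExtractor,
    PDFDownloader,
    paper_url_to_uuid,
)

url = "https://aclanthology.org/2022.wiesp-1.1.pdf"  # placeholder paper URL
pdf_path = PDFDownloader().download(url, f"./pdfs/{paper_url_to_uuid(url)}.pdf")

extractor = GrobidFulltextExtractor(tmp_dir="./tmp/grobid")
result = extractor(pdf_path)  # returns None if Grobid fails to parse the document
if result is not None:
    plain_text, raw_extraction = result
    print(plain_text[:300])
```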
src/utils/pdf_utils/raw_paper.py ADDED
@@ -0,0 +1,90 @@
1
+ import json
2
+ import os
3
+ from dataclasses import asdict, dataclass
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from jsonschema import validate
8
+
9
+ # TODO: load from file
10
+ schema = {
11
+ "title": "RawPaper",
12
+ "type": "object",
13
+ "properties": {
14
+ "paper_uuid": {"type": "string"},
15
+ "name": {"type": "string"},
16
+ "collection_id": {"type": "string"},
17
+ "collection_acronym": {"type": "string"},
18
+ "volume_id": {"type": "string"},
19
+ "booktitle": {"type": "string"},
20
+ "paper_id": {"type": "integer"},
21
+ "year": {"type": ["integer", "null"]},
22
+ "paper_title": {"type": "string"},
23
+ "authors": {
24
+ "type": "array",
25
+ "items": {
26
+ "type": "object",
27
+ "items": {
28
+ "first": {"type": ["string", "null"]},
29
+ "last": {"type": ["string", "null"]},
30
+ },
31
+ },
32
+ },
33
+ "abstract": {"type": ["string", "null"]},
34
+ "url": {"type": "string"},
35
+ "bibkey": {"type": ["string", "null"]},
36
+ "doi": {"type": ["string", "null"]},
37
+ "fulltext": {
38
+ "type": ["object", "null"],
39
+ "patternProperties": {"^.*$": {"type": "array", "items": {"type": "string"}}},
40
+ },
41
+ },
42
+ }
43
+
44
+ assert isinstance(schema, dict)
45
+
46
+
47
+ @dataclass
48
+ class RawPaper:
49
+ paper_uuid: str
50
+ name: str
51
+
52
+ collection_id: str
53
+ collection_acronym: str
54
+ volume_id: str
55
+ booktitle: str
56
+ paper_id: int
57
+ year: int | None
58
+
59
+ paper_title: str
60
+ authors: list[dict[str, str | None]]
61
+ abstract: str | None
62
+ url: str | None
63
+ bibkey: str
64
+ doi: str | None
65
+ fulltext: dict[str, list[str]] | None
66
+
67
+ @classmethod
68
+ def load_from_json(cls, fpath: str | Path) -> "RawPaper":
69
+ fpath = fpath if not isinstance(fpath, Path) else str(fpath)
70
+ # return cls(**sienna.load(fpath))
71
+ with open(fpath, "r") as f:
72
+ data = cls(**json.load(f))
73
+ return data
74
+
75
+ def get_fname(self) -> str:
76
+ return f"{self.name}.json"
77
+
78
+ def dumps(self) -> dict[str, Any]:
79
+ return asdict(self)
80
+
81
+ def validate(self) -> None:
82
+ validate(self.dumps(), schema=schema)
83
+
84
+ def save(self, odir: str) -> None:
85
+ self.validate()
86
+ if not os.path.exists(odir):
87
+ os.makedirs(odir, exist_ok=True)
88
+ opath = os.path.join(odir, self.get_fname())
89
+ with open(opath, "w") as f:
90
+ f.write(json.dumps(self.dumps(), indent=2))
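A small sketch of reloading and re-saving such a record; `save` validates against the schema before writing (paths are placeholders):

```python
from src.utils.pdf_utils.raw_paper import RawPaper

paper = RawPaper.load_from_json("./papers/2022.wiesp-1.1.json")
print(paper.paper_title, paper.year)
paper.save("./papers_copy")  # re-validates and writes ./papers_copy/<name>.json
```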
src/utils/pdf_utils/s2orc_paper.py ADDED
@@ -0,0 +1,478 @@
1
+ from datetime import datetime
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ S2ORC_NAME_STRING = "S2ORC"
5
+ S2ORC_VERSION_STRING = "1.0.0"
6
+
7
+ CORRECT_KEYS = {"issn": "issue", "type": "type_str"}
8
+
9
+ SKIP_KEYS = {"link", "bib_id"}
10
+
11
+ REFERENCE_OUTPUT_KEYS = {
12
+ "figure": {"text", "type_str", "uris", "num", "fig_num"},
13
+ "table": {"text", "type_str", "content", "num", "html"},
14
+ "footnote": {"text", "type_str", "num"},
15
+ "section": {"text", "type_str", "num", "parent"},
16
+ "equation": {"text", "type_str", "latex", "mathml", "num"},
17
+ }
18
+
19
+ METADATA_KEYS = {"title", "authors", "year", "venue", "identifiers"}
20
+
21
+
22
+ class ReferenceEntry:
23
+ """
24
+ Class for representing S2ORC figure and table references
25
+
26
+ An example json representation (values are examples, not accurate):
27
+
28
+ {
29
+ "FIGREF0": {
30
+ "text": "FIG. 2. Depth profiles of...",
31
+ "latex": null,
32
+ "type": "figure"
33
+ },
34
+ "TABREF2": {
35
+ "text": "Diversity indices of...",
36
+ "latex": null,
37
+ "type": "table",
38
+ "content": "",
39
+ "html": ""
40
+ }
41
+ }
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ ref_id: str,
47
+ text: str,
48
+ type_str: str,
49
+ latex: Optional[str] = None,
50
+ mathml: Optional[str] = None,
51
+ content: Optional[str] = None,
52
+ html: Optional[str] = None,
53
+ uris: Optional[List[str]] = None,
54
+ num: Optional[str] = None,
55
+ parent: Optional[str] = None,
56
+ fig_num: Optional[str] = None,
57
+ ):
58
+ self.ref_id = ref_id
59
+ self.text = text
60
+ self.type_str = type_str
61
+ self.latex = latex
62
+ self.mathml = mathml
63
+ self.content = content
64
+ self.html = html
65
+ self.uris = uris
66
+ self.num = num
67
+ self.parent = parent
68
+ self.fig_num = fig_num
69
+
70
+ def as_json(self):
71
+ keep_keys = REFERENCE_OUTPUT_KEYS.get(self.type_str, None)
72
+ if keep_keys:
73
+ return {k: self.__getattribute__(k) for k in keep_keys}
74
+ else:
75
+ return {
76
+ "text": self.text,
77
+ "type": self.type_str,
78
+ "latex": self.latex,
79
+ "mathml": self.mathml,
80
+ "content": self.content,
81
+ "html": self.html,
82
+ "uris": self.uris,
83
+ "num": self.num,
84
+ "parent": self.parent,
85
+ "fig_num": self.fig_num,
86
+ }
87
+
88
+
89
+ class BibliographyEntry:
90
+ """
91
+ Class for representing S2ORC parsed bibliography entries
92
+
93
+ An example json representation (values are examples, not accurate):
94
+
95
+ {
96
+ "title": "Mobility Reports...",
97
+ "authors": [
98
+ {
99
+ "first": "A",
100
+ "middle": ["A"],
101
+ "last": "Haija",
102
+ "suffix": ""
103
+ }
104
+ ],
105
+ "year": 2015,
106
+ "venue": "IEEE Wireless Commune Mag",
107
+ "volume": "42",
108
+ "issn": "9",
109
+ "pages": "80--92",
110
+ "other_ids": {
111
+ "doi": [
112
+ "10.1109/TWC.2014.2360196"
113
+ ],
114
+
115
+ }
116
+ }
117
+
118
+ """
119
+
120
+ def __init__(
121
+ self,
122
+ bib_id: str,
123
+ title: str,
124
+ authors: List[Dict[str, str]],
125
+ ref_id: Optional[str] = None,
126
+ year: Optional[int] = None,
127
+ venue: Optional[str] = None,
128
+ volume: Optional[str] = None,
129
+ issue: Optional[str] = None,
130
+ pages: Optional[str] = None,
131
+ other_ids: Optional[Dict[str, List]] = None,
132
+ num: Optional[int] = None,
133
+ urls: Optional[List] = None,
134
+ raw_text: Optional[str] = None,
135
+ links: Optional[List] = None,
136
+ ):
137
+ self.bib_id = bib_id
138
+ self.ref_id = ref_id
139
+ self.title = title
140
+ self.authors = authors
141
+ self.year = year
142
+ self.venue = venue
143
+ self.volume = volume
144
+ self.issue = issue
145
+ self.pages = pages
146
+ self.other_ids = other_ids
147
+ self.num = num
148
+ self.urls = urls
149
+ self.raw_text = raw_text
150
+ self.links = links
151
+
152
+ def as_json(self):
153
+ return {
154
+ "ref_id": self.ref_id,
155
+ "title": self.title,
156
+ "authors": self.authors,
157
+ "year": self.year,
158
+ "venue": self.venue,
159
+ "volume": self.volume,
160
+ "issue": self.issue,
161
+ "pages": self.pages,
162
+ "other_ids": self.other_ids,
163
+ "num": self.num,
164
+ "urls": self.urls,
165
+ "raw_text": self.raw_text,
166
+ "links": self.links,
167
+ }
168
+
169
+
170
+ class Affiliation:
171
+ """
172
+ Class for representing affiliation info
173
+
174
+ Example:
175
+ {
176
+ "laboratory": "Key Laboratory of Urban Environment and Health",
177
+ "institution": "Chinese Academy of Sciences",
178
+ "location": {
179
+ "postCode": "361021",
180
+ "settlement": "Xiamen",
181
+ "country": "People's Republic of China"
182
+ }
+ }
183
+ """
184
+
185
+ def __init__(self, laboratory: str, institution: str, location: Dict):
186
+ self.laboratory = laboratory
187
+ self.institution = institution
188
+ self.location = location
189
+
190
+ def as_json(self):
191
+ return {
192
+ "laboratory": self.laboratory,
193
+ "institution": self.institution,
194
+ "location": self.location,
195
+ }
196
+
197
+
198
+ class Author:
199
+ """
200
+ Class for representing paper authors
201
+
202
+ Example:
203
+
204
+ {
205
+ "first": "Anyi",
206
+ "middle": [],
207
+ "last": "Hu",
208
+ "suffix": "",
209
+ "affiliation": {
210
+ "laboratory": "Key Laboratory of Urban Environment and Health",
211
+ "institution": "Chinese Academy of Sciences",
212
+ "location": {
213
+ "postCode": "361021",
214
+ "settlement": "Xiamen",
215
+ "country": "People's Republic of China"
216
+ }
217
+ },
218
+ "email": ""
219
+ }
220
+ """
221
+
222
+ def __init__(
223
+ self,
224
+ first: str,
225
+ middle: List[str],
226
+ last: str,
227
+ suffix: str,
228
+ affiliation: Optional[Dict] = None,
229
+ email: Optional[str] = None,
230
+ ):
231
+ self.first = first
232
+ self.middle = middle
233
+ self.last = last
234
+ self.suffix = suffix
235
+ self.affiliation = Affiliation(**affiliation) if affiliation else {}
236
+ self.email = email
237
+
238
+ def as_json(self):
239
+ return {
240
+ "first": self.first,
241
+ "middle": self.middle,
242
+ "last": self.last,
243
+ "suffix": self.suffix,
244
+ "affiliation": self.affiliation.as_json() if self.affiliation else {},
245
+ "email": self.email,
246
+ }
247
+
248
+
249
+ class Metadata:
250
+ """
251
+ Class for representing paper metadata
252
+
253
+ Example:
254
+ {
255
+ "title": "Niche Partitioning...",
256
+ "authors": [
257
+ {
258
+ "first": "Anyi",
259
+ "middle": [],
260
+ "last": "Hu",
261
+ "suffix": "",
262
+ "affiliation": {
263
+ "laboratory": "Key Laboratory of Urban Environment and Health",
264
+ "institution": "Chinese Academy of Sciences",
265
+ "location": {
266
+ "postCode": "361021",
267
+ "settlement": "Xiamen",
268
+ "country": "People's Republic of China"
269
+ }
270
+ },
271
+ "email": ""
272
+ }
273
+ ],
274
+ "year": "2011-11"
275
+ }
276
+ """
277
+
278
+ def __init__(
279
+ self,
280
+ title: str,
281
+ authors: List[Dict],
282
+ year: Optional[str] = None,
283
+ venue: Optional[str] = None,
284
+ identifiers: Optional[Dict] = {},
285
+ ):
286
+ self.title = title
287
+ self.authors = [Author(**author) for author in authors]
288
+ self.year = year
289
+ self.venue = venue
290
+ self.identifiers = identifiers
291
+
292
+ def as_json(self):
293
+ return {
294
+ "title": self.title,
295
+ "authors": [author.as_json() for author in self.authors],
296
+ "year": self.year,
297
+ "venue": self.venue,
298
+ "identifiers": self.identifiers,
299
+ }
300
+
301
+
302
+ class Paragraph:
303
+ """
304
+ Class for representing a parsed paragraph from Grobid xml
305
+ All xml tags are removed from the paragraph text, all figures, equations, and tables are replaced
306
+ with a special token that maps to a reference identifier
307
+ Citation mention spans and section header are extracted
308
+
309
+ An example json representation (values are examples, not accurate):
310
+
311
+ {
312
+ "text": "Formal language techniques BID1 may be used to study FORMULA0 (see REF0)...",
313
+ "mention_spans": [
314
+ {
315
+ "start": 27,
316
+ "end": 31,
317
+ "text": "[1]")
318
+ ],
319
+ "ref_spans": [
320
+ {
321
+ "start": ,
322
+ "end": ,
323
+ "text": "Fig. 1"
324
+ }
325
+ ],
326
+ "eq_spans": [
327
+ {
328
+ "start": 53,
329
+ "end": 61,
330
+ "text": "α = 1",
331
+ "latex": "\\alpha = 1",
332
+ "ref_id": null
333
+ }
334
+ ],
335
+ "section": "Abstract"
336
+ }
337
+ """
338
+
339
+ def __init__(
340
+ self,
341
+ text: str,
342
+ cite_spans: List[Dict],
343
+ ref_spans: List[Dict],
344
+ eq_spans: Optional[List[Dict]] = [],
345
+ section: Optional[Any] = None,
346
+ sec_num: Optional[Any] = None,
347
+ ):
348
+ self.text = text
349
+ self.cite_spans = cite_spans
350
+ self.ref_spans = ref_spans
351
+ self.eq_spans = eq_spans
352
+ if type(section) is str:
353
+ if section:
354
+ sec_parts = section.split("::")
355
+ section_list = [[None, sec_name] for sec_name in sec_parts]
356
+ else:
357
+ section_list = None
358
+ if section_list and sec_num:
359
+ section_list[-1][0] = sec_num
360
+ else:
361
+ section_list = section
362
+ self.section = section_list
363
+
364
+ def as_json(self):
365
+ return {
366
+ "text": self.text,
367
+ "cite_spans": self.cite_spans,
368
+ "ref_spans": self.ref_spans,
369
+ "eq_spans": self.eq_spans,
370
+ "section": "::".join([sec[1] for sec in self.section]) if self.section else "",
371
+ "sec_num": self.section[-1][0] if self.section else None,
372
+ }
373
+
374
+
375
+ class Paper:
376
+ """
377
+ Class for representing a parsed S2ORC paper
378
+ """
379
+
380
+ def __init__(
381
+ self,
382
+ paper_id: str,
383
+ pdf_hash: str,
384
+ metadata: Dict,
385
+ abstract: List[Dict],
386
+ body_text: List[Dict],
387
+ back_matter: List[Dict],
388
+ bib_entries: Dict,
389
+ ref_entries: Dict,
390
+ ):
391
+ self.paper_id = paper_id
392
+ self.pdf_hash = pdf_hash
393
+ self.metadata = Metadata(**metadata)
394
+ self.abstract = [Paragraph(**para) for para in abstract]
395
+ self.body_text = [Paragraph(**para) for para in body_text]
396
+ self.back_matter = [Paragraph(**para) for para in back_matter]
397
+ self.bib_entries = [
398
+ BibliographyEntry(
399
+ bib_id=key,
400
+ **{
401
+ CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v
402
+ for k, v in bib.items()
403
+ if k not in SKIP_KEYS
404
+ },
405
+ )
406
+ for key, bib in bib_entries.items()
407
+ ]
408
+ self.ref_entries = [
409
+ ReferenceEntry(
410
+ ref_id=key,
411
+ **{
412
+ CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v
413
+ for k, v in ref.items()
414
+ if k != "ref_id"
415
+ },
416
+ )
417
+ for key, ref in ref_entries.items()
418
+ ]
419
+
420
+ def as_json(self):
421
+ return {
422
+ "paper_id": self.paper_id,
423
+ "pdf_hash": self.pdf_hash,
424
+ "metadata": self.metadata.as_json(),
425
+ "abstract": [para.as_json() for para in self.abstract],
426
+ "body_text": [para.as_json() for para in self.body_text],
427
+ "back_matter": [para.as_json() for para in self.back_matter],
428
+ "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
429
+ "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries},
430
+ }
431
+
432
+ @property
433
+ def raw_abstract_text(self) -> str:
434
+ """
435
+ Get all the abstract text joined by a newline
436
+ :return:
437
+ """
438
+ return "\n".join([para.text for para in self.abstract])
439
+
440
+ @property
441
+ def raw_body_text(self) -> str:
442
+ """
443
+ Get all the body text joined by a newline
444
+ :return:
445
+ """
446
+ return "\n".join([para.text for para in self.body_text])
447
+
448
+ def release_json(self, doc_type: str = "pdf") -> Dict:
449
+ """
450
+ Return in release JSON format
451
+ :return:
452
+ """
453
+ # TODO: not fully implemented; metadata format is not right; extra keys in some places
454
+ release_dict: Dict = {"paper_id": self.paper_id}
455
+ release_dict.update(
456
+ {
457
+ "header": {
458
+ "generated_with": f"{S2ORC_NAME_STRING} {S2ORC_VERSION_STRING}",
459
+ "date_generated": datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
460
+ }
461
+ }
462
+ )
463
+ release_dict.update(self.metadata.as_json())
464
+ release_dict.update({"abstract": self.raw_abstract_text})
465
+ release_dict.update(
466
+ {
467
+ f"{doc_type}_parse": {
468
+ "paper_id": self.paper_id,
469
+ "_pdf_hash": self.pdf_hash,
470
+ "abstract": [para.as_json() for para in self.abstract],
471
+ "body_text": [para.as_json() for para in self.body_text],
472
+ "back_matter": [para.as_json() for para in self.back_matter],
473
+ "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
474
+ "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries},
475
+ }
476
+ }
477
+ )
478
+ return release_dict
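Note: a small sketch of the classes above in isolation; the values are invented, only the constructor signatures and as_json methods are taken from the code.

# Sketch: build a bibliography entry and a paragraph by hand; all values are invented.
bib = BibliographyEntry(
    bib_id="BIBREF0",
    title="An Example Reference",
    authors=[{"first": "A", "middle": [], "last": "Haija", "suffix": ""}],
    year=2015,
)
para = Paragraph(
    text="See [1] for details.",
    cite_spans=[{"start": 4, "end": 7, "text": "[1]", "ref_id": "BIBREF0"}],
    ref_spans=[],
    section="Introduction",
    sec_num="1",
)
print(bib.as_json()["title"])     # "An Example Reference"
print(para.as_json()["section"])  # "Introduction"
print(para.as_json()["sec_num"])  # "1"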
src/utils/pdf_utils/s2orc_utils.py ADDED
@@ -0,0 +1,61 @@
1
+ from typing import Any, Dict
2
+
3
+ from .s2orc_paper import METADATA_KEYS, Paper
4
+
5
+
6
+ def load_s2orc(paper_dict: Dict[str, Any]) -> Paper:
7
+ """
8
+ Load release S2ORC into Paper class
9
+ :param paper_dict:
10
+ :return:
11
+ """
12
+ paper_id = paper_dict["paper_id"]
13
+ pdf_hash = paper_dict.get("_pdf_hash", paper_dict.get("s2_pdf_hash", None))
14
+
15
+ # 2019 gorc parses
16
+ grobid_parse = paper_dict.get("grobid_parse")
17
+ if grobid_parse:
18
+ metadata = {k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS}
19
+ abstract = grobid_parse.get("abstract", [])
20
+ body_text = grobid_parse.get("body_text", [])
21
+ back_matter = grobid_parse.get("back_matter", [])
22
+ bib_entries = grobid_parse.get("bib_entries", {})
23
+ for k, v in bib_entries.items():
24
+ if "link" in v:
25
+ v["links"] = [v["link"]]
26
+ ref_entries = grobid_parse.get("ref_entries", {})
27
+ # current and 2020 s2orc release_json
28
+ elif ("pdf_parse" in paper_dict and paper_dict.get("pdf_parse")) or (
29
+ "body_text" in paper_dict and paper_dict.get("body_text")
30
+ ):
31
+ if "pdf_parse" in paper_dict:
32
+ paper_dict = paper_dict["pdf_parse"]
33
+ if paper_dict.get("metadata"):
34
+ metadata = {
35
+ k: v for k, v in paper_dict.get("metadata", {}).items() if k in METADATA_KEYS
36
+ }
37
+ # 2020 s2orc releases (metadata is separate)
38
+ else:
39
+ metadata = {"title": None, "authors": [], "year": None}
40
+ abstract = paper_dict.get("abstract", [])
41
+ body_text = paper_dict.get("body_text", [])
42
+ back_matter = paper_dict.get("back_matter", [])
43
+ bib_entries = paper_dict.get("bib_entries", {})
44
+ for k, v in bib_entries.items():
45
+ if "link" in v:
46
+ v["links"] = [v["link"]]
47
+ ref_entries = paper_dict.get("ref_entries", {})
48
+ else:
49
+ print(paper_id)
50
+ raise NotImplementedError("Unknown S2ORC file type!")
51
+
52
+ return Paper(
53
+ paper_id=paper_id,
54
+ pdf_hash=pdf_hash,
55
+ metadata=metadata,
56
+ abstract=abstract,
57
+ body_text=body_text,
58
+ back_matter=back_matter,
59
+ bib_entries=bib_entries,
60
+ ref_entries=ref_entries,
61
+ )
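Note: a brief sketch of loading a release-format S2ORC/doc2json file with load_s2orc defined above; the file path is a placeholder.

# Sketch: load a release-format JSON file and access the parsed paper; the path is a placeholder.
import json

with open("paper.s2orc.json") as f:
    paper = load_s2orc(json.load(f))

print(paper.metadata.title)
print(paper.raw_abstract_text[:200])
print(len(paper.body_text), "body paragraphs")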
src/utils/pdf_utils/utils.py ADDED
@@ -0,0 +1,904 @@
1
+ import re
2
+ from typing import Dict, List, Tuple
3
+
4
+ import bs4
5
+
6
+
7
+ def replace_refspans(
8
+ spans_to_replace: List[Tuple[int, int, str, str]],
9
+ full_string: str,
10
+ pre_padding: str = "",
11
+ post_padding: str = "",
12
+ btwn_padding: str = ", ",
13
+ ) -> str:
14
+ """
15
+ For each span within the full string, replace that span with new text
16
+ :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring)
17
+ :param full_string:
18
+ :param pre_padding:
19
+ :param post_padding:
20
+ :param btwn_padding:
21
+ :return:
22
+ """
23
+ # assert all spans are equal to full_text span
24
+ assert all([full_string[start:end] == span for start, end, span, _ in spans_to_replace])
25
+
26
+ # assert none of the spans start with the same start ind
27
+ start_inds = [rep[0] for rep in spans_to_replace]
28
+ assert len(set(start_inds)) == len(start_inds)
29
+
30
+ # sort by start index
31
+ spans_to_replace.sort(key=lambda x: x[0])
32
+
33
+ # form strings for each span group
34
+ for i, entry in enumerate(spans_to_replace):
35
+ start, end, span, new_string = entry
36
+
37
+ # skip empties
38
+ if end <= 0:
39
+ continue
40
+
41
+ # compute shift amount
42
+ shift_amount = len(new_string) - len(span) + len(pre_padding) + len(post_padding)
43
+
44
+ # shift remaining appropriately
45
+ for ind in range(i + 1, len(spans_to_replace)):
46
+ next_start, next_end, next_span, next_string = spans_to_replace[ind]
47
+ # skip empties
48
+ if next_end <= 0:
49
+ continue
50
+ # if overlap between ref span and current ref span, remove from replacement
51
+ if next_start < end:
52
+ next_start = 0
53
+ next_end = 0
54
+ next_string = ""
55
+ # if ref span abuts previous reference span
56
+ elif next_start == end:
57
+ next_start += shift_amount
58
+ next_end += shift_amount
59
+ next_string = btwn_padding + pre_padding + next_string + post_padding
60
+ # if ref span starts after, shift starts and ends
61
+ elif next_start > end:
62
+ next_start += shift_amount
63
+ next_end += shift_amount
64
+ next_string = pre_padding + next_string + post_padding
65
+ # save adjusted span
66
+ spans_to_replace[ind] = (next_start, next_end, next_span, next_string)
67
+
68
+ spans_to_replace = [entry for entry in spans_to_replace if entry[1] > 0]
69
+ spans_to_replace.sort(key=lambda x: x[0])
70
+
71
+ # apply shifts in series
72
+ for start, end, span, new_string in spans_to_replace:
73
+ assert full_string[start:end] == span
74
+ full_string = full_string[:start] + new_string + full_string[end:]
75
+
76
+ return full_string
77
+
78
+
79
+ BRACKET_REGEX = re.compile(r"\[[1-9]\d{0,2}([,;\-\s]+[1-9]\d{0,2})*;?\]")
80
+ BRACKET_STYLE_THRESHOLD = 5
81
+
82
+ SINGLE_BRACKET_REGEX = re.compile(r"\[([1-9]\d{0,2})\]")
83
+ EXPANSION_CHARS = {"-", "–"}
84
+
85
+ REPLACE_TABLE_TOKS = {
86
+ "<row>": "<tr>",
87
+ "<row/>": "<tr/>",
88
+ "</row>": "</tr>",
89
+ "<cell>": "<td>",
90
+ "<cell/>": "<td/>",
91
+ "</cell>": "</td>",
92
+ "<cell ": "<td ",
93
+ "cols=": "colspan=",
94
+ }
95
+
96
+
97
+ def span_already_added(sub_start: int, sub_end: int, span_indices: List[Tuple[int, int]]) -> bool:
98
+ """
99
+ Check if span is a subspan of existing span
100
+ :param sub_start:
101
+ :param sub_end:
102
+ :param span_indices:
103
+ :return:
104
+ """
105
+ for span_start, span_end in span_indices:
106
+ if sub_start >= span_start and sub_end <= span_end:
107
+ return True
108
+ return False
109
+
110
+
111
+ def is_expansion_string(between_string: str) -> bool:
112
+ """
113
+ Check if the string between two refs is an expansion string
114
+ :param between_string:
115
+ :return:
116
+ """
117
+ if (
118
+ len(between_string) <= 2
119
+ and any([c in EXPANSION_CHARS for c in between_string])
120
+ and all([c in EXPANSION_CHARS.union({" "}) for c in between_string])
121
+ ):
122
+ return True
123
+ return False
124
+
125
+
126
+ # TODO: still cases like `09bcee03baceb509d4fcf736fa1322cb8adf507f` w/ dups like ['L Jung', 'R Hessler', 'Louis Jung', 'Roland Hessler']
127
+ # example paper that has empties & duplicates: `09bce26cc7e825e15a4469e3e78b7a54898bb97f`
128
+ def _clean_empty_and_duplicate_authors_from_grobid_parse(
129
+ authors: List[Dict],
130
+ ) -> List[Dict]:
131
+ """
132
+ Within affiliation, `location` is a dict with fields <settlement>, <region>, <country>, <postCode>, etc.
133
+ Too much hassle, so just take the first one that's not empty.
134
+ """
135
+ # stripping empties
136
+ clean_authors_list = []
137
+ for author in authors:
138
+ clean_first = author["first"].strip()
139
+ clean_last = author["last"].strip()
140
+ clean_middle = [m.strip() for m in author["middle"]]
141
+ clean_suffix = author["suffix"].strip()
142
+ if clean_first or clean_last or clean_middle:
143
+ author["first"] = clean_first
144
+ author["last"] = clean_last
145
+ author["middle"] = clean_middle
146
+ author["suffix"] = clean_suffix
147
+ clean_authors_list.append(author)
148
+ # combining duplicates (preserve first occurrence of author name as position)
149
+ key_to_author_blobs = {}
150
+ ordered_keys_by_author_pos = []
151
+ for author in clean_authors_list:
152
+ key = (
153
+ author["first"],
154
+ author["last"],
155
+ " ".join(author["middle"]),
156
+ author["suffix"],
157
+ )
158
+ if key not in key_to_author_blobs:
159
+ key_to_author_blobs[key] = author
160
+ ordered_keys_by_author_pos.append(key)
161
+ else:
162
+ if author["email"]:
163
+ key_to_author_blobs[key]["email"] = author["email"]
164
+ if author["affiliation"] and (
165
+ author["affiliation"]["institution"]
166
+ or author["affiliation"]["laboratory"]
167
+ or author["affiliation"]["location"]
168
+ ):
169
+ key_to_author_blobs[key]["affiliation"] = author["affiliation"]
170
+ dedup_authors_list = [key_to_author_blobs[key] for key in ordered_keys_by_author_pos]
171
+ return dedup_authors_list
172
+
173
+
174
+ def sub_spans_and_update_indices(
175
+ spans_to_replace: List[Tuple[int, int, str, str]], full_string: str
176
+ ) -> Tuple[str, List]:
177
+ """
178
+ Replace all spans and recompute indices
179
+ :param spans_to_replace:
180
+ :param full_string:
181
+ :return:
182
+ """
183
+ # TODO: check no spans overlapping
184
+ # TODO: check all spans well-formed
185
+
186
+ # assert all spans are equal to full_text span
187
+ assert all([full_string[start:end] == token for start, end, token, _ in spans_to_replace])
188
+
189
+ # assert none of the spans start with the same start ind
190
+ start_inds = [rep[0] for rep in spans_to_replace]
191
+ assert len(set(start_inds)) == len(start_inds)
192
+
193
+ # sort by start index
194
+ spans_to_replace.sort(key=lambda x: x[0])
195
+
196
+ # compute offsets for each span
197
+ new_spans = [
198
+ (start, end, token, surface, 0) for start, end, token, surface in spans_to_replace
199
+ ]
200
+ for i, entry in enumerate(spans_to_replace):
201
+ start, end, token, surface = entry
202
+ new_end = start + len(surface)
203
+ offset = new_end - end
204
+ # new_spans[i][1] += offset
205
+ new_spans[i] = (
206
+ new_spans[i][0],
207
+ new_spans[i][1] + offset,
208
+ new_spans[i][2],
209
+ new_spans[i][3],
210
+ new_spans[i][4],
211
+ )
212
+ # for new_span_entry in new_spans[i + 1 :]:
213
+ # new_span_entry[4] += offset
214
+ for j in range(i + 1, len(new_spans)):
215
+ new_spans[j] = (
216
+ new_spans[j][0],
217
+ new_spans[j][1],
218
+ new_spans[j][2],
219
+ new_spans[j][3],
220
+ new_spans[j][4] + offset,
221
+ )
222
+
223
+ # generate new text and create final spans
224
+ new_text = replace_refspans(spans_to_replace, full_string, btwn_padding="")
225
+ result = [
226
+ (start + offset, end + offset, token, surface)
227
+ for start, end, token, surface, offset in new_spans
228
+ ]
229
+
230
+ return new_text, result
231
+
232
+
233
+ class UniqTokenGenerator:
234
+ """
235
+ Generate unique token
236
+ """
237
+
238
+ def __init__(self, tok_string):
239
+ self.tok_string = tok_string
240
+ self.ind = 0
241
+
242
+ def __iter__(self):
243
+ return self
244
+
245
+ def __next__(self):
246
+ return self.next()
247
+
248
+ def next(self):
249
+ new_token = f"{self.tok_string}{self.ind}"
250
+ self.ind += 1
251
+ return new_token
252
+
253
+
254
+ def normalize_grobid_id(grobid_id: str):
255
+ """
256
+ Normalize grobid object identifiers
257
+ :param grobid_id:
258
+ :return:
259
+ """
260
+ str_norm = grobid_id.upper().replace("_", "").replace("#", "")
261
+ if str_norm.startswith("B"):
262
+ return str_norm.replace("B", "BIBREF")
263
+ if str_norm.startswith("TAB"):
264
+ return str_norm.replace("TAB", "TABREF")
265
+ if str_norm.startswith("FIG"):
266
+ return str_norm.replace("FIG", "FIGREF")
267
+ if str_norm.startswith("FORMULA"):
268
+ return str_norm.replace("FORMULA", "EQREF")
269
+ return str_norm
270
+
271
+
272
+ def extract_formulas_from_tei_xml(sp: bs4.BeautifulSoup) -> None:
273
+ """
274
+ Replace all formulas with the text
275
+ :param sp:
276
+ :return:
277
+ """
278
+ for eq in sp.find_all("formula"):
279
+ eq.replace_with(sp.new_string(eq.text.strip()))
280
+
281
+
282
+ def table_to_html(table: bs4.element.Tag) -> str:
283
+ """
284
+ Sub table tags with html table tags
285
+ :param table:
286
+ :return:
287
+ """
288
+ for tag in table:
289
+ if tag.name != "row":
290
+ print(f"Unknown table subtag: {tag.name}")
291
+ tag.decompose()
292
+ table_str = str(table)
293
+ for token, subtoken in REPLACE_TABLE_TOKS.items():
294
+ table_str = table_str.replace(token, subtoken)
295
+ return table_str
296
+
297
+
298
+ def extract_figures_and_tables_from_tei_xml(sp: bs4.BeautifulSoup) -> Dict[str, Dict]:
299
+ """
300
+ Generate figure and table dicts
301
+ :param sp:
302
+ :return:
303
+ """
304
+ ref_map = dict()
305
+
306
+ for fig in sp.find_all("figure"):
307
+ try:
308
+ if fig.name and fig.get("xml:id"):
309
+ if fig.get("type") == "table":
310
+ ref_map[normalize_grobid_id(fig.get("xml:id"))] = {
311
+ "text": (
312
+ fig.figDesc.text.strip()
313
+ if fig.figDesc
314
+ else fig.head.text.strip() if fig.head else ""
315
+ ),
316
+ "latex": None,
317
+ "type": "table",
318
+ "content": table_to_html(fig.table),
319
+ "fig_num": fig.get("xml:id"),
320
+ }
321
+ else:
322
+ if True in [char.isdigit() for char in fig.findNext("head").findNext("label")]:
323
+ fig_num = fig.findNext("head").findNext("label").contents[0]
324
+ else:
325
+ fig_num = None
326
+ ref_map[normalize_grobid_id(fig.get("xml:id"))] = {
327
+ "text": fig.figDesc.text.strip() if fig.figDesc else "",
328
+ "latex": None,
329
+ "type": "figure",
330
+ "content": "",
331
+ "fig_num": fig_num,
332
+ }
333
+ except AttributeError:
334
+ continue
335
+ fig.decompose()
336
+
337
+ return ref_map
338
+
339
+
340
+ def check_if_citations_are_bracket_style(sp: bs4.BeautifulSoup) -> bool:
341
+ """
342
+ Check if the document has bracket style citations
343
+ :param sp:
344
+ :return:
345
+ """
346
+ cite_strings = []
347
+ if sp.body:
348
+ for div in sp.body.find_all("div"):
349
+ if div.head:
350
+ continue
351
+ for rtag in div.find_all("ref"):
352
+ ref_type = rtag.get("type")
353
+ if ref_type == "bibr":
354
+ cite_strings.append(rtag.text.strip())
355
+
356
+ # check how many match bracket style
357
+ bracket_style = [bool(BRACKET_REGEX.match(cite_str)) for cite_str in cite_strings]
358
+
359
+ # return true if the number of bracket-style citations exceeds the threshold
360
+ if sum(bracket_style) > BRACKET_STYLE_THRESHOLD:
361
+ return True
362
+
363
+ return False
364
+
365
+
366
+ def sub_all_note_tags(sp: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
367
+ """
368
+ Sub all note tags with p tags
369
370
+ :param sp:
371
+ :return:
372
+ """
373
+ for ntag in sp.find_all("note"):
374
+ p_tag = sp.new_tag("p")
375
+ p_tag.string = ntag.text.strip()
376
+ ntag.replace_with(p_tag)
377
+ return sp
378
+
379
+
380
+ def process_formulas_in_paragraph(para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup) -> None:
381
+ """
382
+ Process all formulas in paragraph and replace with text and label
383
+ :param para_el:
384
+ :param sp:
385
+ :return:
386
+ """
387
+ for ftag in para_el.find_all("formula"):
388
+ # get label if exists and insert a space between formula and label
389
+ if ftag.label:
390
+ label = " " + ftag.label.text
391
+ ftag.label.decompose()
392
+ else:
393
+ label = ""
394
+ ftag.replace_with(sp.new_string(f"{ftag.text.strip()}{label}"))
395
+
396
+
397
+ def process_references_in_paragraph(
398
+ para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup, refs: Dict
399
+ ) -> Dict:
400
+ """
401
+ Process all references in paragraph and generate a dict that contains (type, ref_id, surface_form)
402
+ :param para_el:
403
+ :param sp:
404
+ :param refs:
405
+ :return:
406
+ """
407
+ tokgen = UniqTokenGenerator("REFTOKEN")
408
+ ref_dict = dict()
409
+ for rtag in para_el.find_all("ref"):
410
+ try:
411
+ ref_type = rtag.get("type")
412
+ # skip if citation
413
+ if ref_type == "bibr":
414
+ continue
415
+ if ref_type == "table" or ref_type == "figure":
416
+ ref_id = rtag.get("target")
417
+ if ref_id and normalize_grobid_id(ref_id) in refs:
418
+ # normalize reference string
419
+ rtag_string = normalize_grobid_id(ref_id)
420
+ else:
421
+ rtag_string = None
422
+ # add to ref set
423
+ ref_key = tokgen.next()
424
+ ref_dict[ref_key] = (rtag_string, rtag.text.strip(), ref_type)
425
+ rtag.replace_with(sp.new_string(f" {ref_key} "))
426
+ else:
427
+ # replace with surface form
428
+ rtag.replace_with(sp.new_string(rtag.text.strip()))
429
+ except AttributeError:
430
+ continue
431
+ return ref_dict
432
+
433
+
434
+ def process_citations_in_paragraph(
435
+ para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup, bibs: Dict, bracket: bool
436
+ ) -> Dict:
437
+ """
438
+ Process all citations in paragraph and generate a dict for surface forms
439
+ :param para_el:
440
+ :param sp:
441
+ :param bibs:
442
+ :param bracket:
443
+ :return:
444
+ """
445
+
446
+ # CHECK if range between two surface forms is appropriate for bracket style expansion
447
+ def _get_surface_range(start_surface, end_surface):
448
+ span1_match = SINGLE_BRACKET_REGEX.match(start_surface)
449
+ span2_match = SINGLE_BRACKET_REGEX.match(end_surface)
450
+ if span1_match and span2_match:
451
+ # get numbers corresponding to citations
452
+ span1_num = int(span1_match.group(1))
453
+ span2_num = int(span2_match.group(1))
454
+ # expand if range is between 1 and 20
455
+ if 1 < span2_num - span1_num < 20:
456
+ return span1_num, span2_num
457
+ return None
458
+
459
+ # CREATE BIBREF range between two reference ids, e.g. BIBREF1-BIBREF4 -> BIBREF1 BIBREF2 BIBREF3 BIBREF4
460
+ def _create_ref_id_range(start_ref_id, end_ref_id):
461
+ start_ref_num = int(start_ref_id[6:])
462
+ end_ref_num = int(end_ref_id[6:])
463
+ return [f"BIBREF{curr_ref_num}" for curr_ref_num in range(start_ref_num, end_ref_num + 1)]
464
+
465
+ # CREATE surface form range between two bracket strings, e.g. [1]-[4] -> [1] [2] [3] [4]
466
+ def _create_surface_range(start_number, end_number):
467
+ return [f"[{n}]" for n in range(start_number, end_number + 1)]
468
+
469
+ # create citation dict with keywords
470
+ cite_map = dict()
471
+ tokgen = UniqTokenGenerator("CITETOKEN")
472
+
473
+ for rtag in para_el.find_all("ref"):
474
+ try:
475
+ # get surface span, e.g. [3]
476
+ surface_span = rtag.text.strip()
477
+
478
+ # check if target is available (#b2 -> BID2)
479
+ if rtag.get("target"):
480
+ # normalize reference string
481
+ rtag_ref_id = normalize_grobid_id(rtag.get("target"))
482
+
483
+ # skip if rtag ref_id not in bibliography
484
+ if rtag_ref_id not in bibs:
485
+ cite_key = tokgen.next()
486
+ rtag.replace_with(sp.new_string(f" {cite_key} "))
487
+ cite_map[cite_key] = (None, surface_span)
488
+ continue
489
+
490
+ # if bracket style, only keep if surface form is bracket
491
+ if bracket:
492
+ # valid bracket span
493
+ if surface_span and (
494
+ surface_span[0] == "["
495
+ or surface_span[-1] == "]"
496
+ or surface_span[-1] == ","
497
+ ):
498
+ pass
499
+ # invalid, replace tag with surface form and continue to next ref tag
500
+ else:
501
+ rtag.replace_with(sp.new_string(f" {surface_span} "))
502
+ continue
503
+ # not bracket, add cite span and move on
504
+ else:
505
+ cite_key = tokgen.next()
506
+ rtag.replace_with(sp.new_string(f" {cite_key} "))
507
+ cite_map[cite_key] = (rtag_ref_id, surface_span)
508
+ continue
509
+
510
+ # EXTRA PROCESSING FOR BRACKET STYLE CITATIONS; EXPAND RANGES ###
511
+ # look backward for range marker, e.g. [1]-*[3]*
512
+ backward_between_span = ""
513
+ for sib in rtag.previous_siblings:
514
+ if sib.name == "ref":
515
+ break
516
+ elif type(sib) is bs4.NavigableString:
517
+ backward_between_span += sib
518
+ else:
519
+ break
520
+
521
+ # check if there's a backwards expansion, e.g. need to expand [1]-[3] -> [1] [2] [3]
522
+ if is_expansion_string(backward_between_span):
523
+ # get surface number range
524
+ surface_num_range = _get_surface_range(
525
+ rtag.find_previous_sibling("ref").text.strip(), surface_span
526
+ )
527
+ # if the surface number range is reasonable (range < 20, in order), EXPAND
528
+ if surface_num_range:
529
+ # delete previous ref tag and anything in between (i.e. delete "-" and extra spaces)
530
+ for sib in rtag.previous_siblings:
531
+ if sib.name == "ref":
532
+ break
533
+ elif type(sib) is bs4.NavigableString:
534
+ sib.replace_with(sp.new_string(""))
535
+ else:
536
+ break
537
+
538
+ # get ref id of previous ref, e.g. [1] (#b0 -> BID0)
539
+ previous_rtag = rtag.find_previous_sibling("ref")
540
+ previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get("target"))
541
+ previous_rtag.decompose()
542
+
543
+ # replace this ref tag with the full range expansion, e.g. [3] (#b2 -> BID1 BID2)
544
+ id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id)
545
+ surface_range = _create_surface_range(
546
+ surface_num_range[0], surface_num_range[1]
547
+ )
548
+ replace_string = ""
549
+ for range_ref_id, range_surface_form in zip(id_range, surface_range):
550
+ # only replace if ref id is in bibliography, else add none
551
+ if range_ref_id in bibs:
552
+ cite_key = tokgen.next()
553
+ cite_map[cite_key] = (range_ref_id, range_surface_form)
554
+ else:
555
+ cite_key = tokgen.next()
556
+ cite_map[cite_key] = (None, range_surface_form)
557
+ replace_string += cite_key + " "
558
+ rtag.replace_with(sp.new_string(f" {replace_string} "))
559
+ # ELSE do not expand backwards and replace previous and current rtag with appropriate ref id
560
+ else:
561
+ # add mapping between ref id and surface form for previous ref tag
562
+ previous_rtag = rtag.find_previous_sibling("ref")
563
+ previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get("target"))
564
+ previous_rtag_surface = previous_rtag.text.strip()
565
+ cite_key = tokgen.next()
566
+ previous_rtag.replace_with(sp.new_string(f" {cite_key} "))
567
+ cite_map[cite_key] = (
568
+ previous_rtag_ref_id,
569
+ previous_rtag_surface,
570
+ )
571
+
572
+ # add mapping between ref id and surface form for current reftag
573
+ cite_key = tokgen.next()
574
+ rtag.replace_with(sp.new_string(f" {cite_key} "))
575
+ cite_map[cite_key] = (rtag_ref_id, surface_span)
576
+ else:
577
+ # look forward and see if expansion string, e.g. *[1]*-[3]
578
+ forward_between_span = ""
579
+ for sib in rtag.next_siblings:
580
+ if sib.name == "ref":
581
+ break
582
+ elif type(sib) is bs4.NavigableString:
583
+ forward_between_span += sib
584
+ else:
585
+ break
586
+ # look forward for range marker (if is a range, continue -- range will be expanded
587
+ # when we get to the second value)
588
+ if is_expansion_string(forward_between_span):
589
+ continue
590
+ # else treat like normal reference
591
+ else:
592
+ cite_key = tokgen.next()
593
+ rtag.replace_with(sp.new_string(f" {cite_key} "))
594
+ cite_map[cite_key] = (rtag_ref_id, surface_span)
595
+
596
+ else:
597
+ cite_key = tokgen.next()
598
+ rtag.replace_with(sp.new_string(f" {cite_key} "))
599
+ cite_map[cite_key] = (None, surface_span)
600
+ except AttributeError:
601
+ continue
602
+
603
+ return cite_map
604
+
605
+
606
+ def process_paragraph(
607
+ sp: bs4.BeautifulSoup,
608
+ para_el: bs4.element.Tag,
609
+ section_names: List[Tuple],
610
+ bib_dict: Dict,
611
+ ref_dict: Dict,
612
+ bracket: bool,
613
+ ) -> Dict:
614
+ """
615
+ Process one paragraph
616
+ :param sp:
617
+ :param para_el:
618
+ :param section_names:
619
+ :param bib_dict:
620
+ :param ref_dict:
621
+ :param bracket: if bracket style, expand and clean up citations
622
+ :return:
623
+ """
624
+ # return empty paragraph if no text
625
+ if not para_el.text:
626
+ return {
627
+ "text": "",
628
+ "cite_spans": [],
629
+ "ref_spans": [],
630
+ "eq_spans": [],
631
+ "section": section_names,
632
+ }
633
+
634
+ # replace formulas with formula text
635
+ process_formulas_in_paragraph(para_el, sp)
636
+
637
+ # get references to tables and figures
638
+ ref_map = process_references_in_paragraph(para_el, sp, ref_dict)
639
+
640
+ # generate citation map for paragraph element (keep only cite spans with bib entry or unlinked)
641
+ cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket)
642
+
643
+ # substitute space characters
644
+ para_text = re.sub(r"\s+", " ", para_el.text)
645
+ para_text = re.sub(r"\s", " ", para_text)
646
+
647
+ # get all cite and ref spans
648
+ all_spans_to_replace = []
649
+ for span in re.finditer(r"(CITETOKEN\d+)", para_text):
650
+ uniq_token = span.group()
651
+ ref_id, surface_text = cite_map[uniq_token]
652
+ all_spans_to_replace.append(
653
+ (span.start(), span.start() + len(uniq_token), uniq_token, surface_text)
654
+ )
655
+ for span in re.finditer(r"(REFTOKEN\d+)", para_text):
656
+ uniq_token = span.group()
657
+ ref_id, surface_text, ref_type = ref_map[uniq_token]
658
+ all_spans_to_replace.append(
659
+ (span.start(), span.start() + len(uniq_token), uniq_token, surface_text)
660
+ )
661
+
662
+ # replace cite and ref spans and create json blobs
663
+ para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text)
664
+
665
+ cite_span_blobs = [
666
+ {"start": start, "end": end, "text": surface, "ref_id": cite_map[token][0]}
667
+ for start, end, token, surface in all_spans_to_replace
668
+ if token.startswith("CITETOKEN")
669
+ ]
670
+
671
+ ref_span_blobs = [
672
+ {"start": start, "end": end, "text": surface, "ref_id": ref_map[token][0]}
673
+ for start, end, token, surface in all_spans_to_replace
674
+ if token.startswith("REFTOKEN")
675
+ ]
676
+
677
+ for cite_blob in cite_span_blobs:
678
+ assert para_text[cite_blob["start"] : cite_blob["end"]] == cite_blob["text"]
679
+
680
+ for ref_blob in ref_span_blobs:
681
+ assert para_text[ref_blob["start"] : ref_blob["end"]] == ref_blob["text"]
682
+
683
+ return {
684
+ "text": para_text,
685
+ "cite_spans": cite_span_blobs,
686
+ "ref_spans": ref_span_blobs,
687
+ "eq_spans": [],
688
+ "section": section_names,
689
+ }
690
+
691
+
692
+ def extract_abstract_from_tei_xml(
693
+ sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
694
+ ) -> List[Dict]:
695
+ """
696
+ Parse abstract from soup
697
+ :param sp:
698
+ :param bib_dict:
699
+ :param ref_dict:
700
+ :param cleanup_bracket:
701
+ :return:
702
+ """
703
+ abstract_text = []
704
+ if sp.abstract:
705
+ # process all divs
706
+ if sp.abstract.div:
707
+ for div in sp.abstract.find_all("div"):
708
+ if div.text:
709
+ if div.p:
710
+ for para in div.find_all("p"):
711
+ if para.text:
712
+ abstract_text.append(
713
+ process_paragraph(
714
+ sp,
715
+ para,
716
+ [(None, "Abstract")],
717
+ bib_dict,
718
+ ref_dict,
719
+ cleanup_bracket,
720
+ )
721
+ )
722
+ else:
723
+ if div.text:
724
+ abstract_text.append(
725
+ process_paragraph(
726
+ sp,
727
+ div,
728
+ [(None, "Abstract")],
729
+ bib_dict,
730
+ ref_dict,
731
+ cleanup_bracket,
732
+ )
733
+ )
734
+ # process all paragraphs
735
+ elif sp.abstract.p:
736
+ for para in sp.abstract.find_all("p"):
737
+ if para.text:
738
+ abstract_text.append(
739
+ process_paragraph(
740
+ sp,
741
+ para,
742
+ [(None, "Abstract")],
743
+ bib_dict,
744
+ ref_dict,
745
+ cleanup_bracket,
746
+ )
747
+ )
748
+ # else just try to get the text
749
+ else:
750
+ if sp.abstract.text:
751
+ abstract_text.append(
752
+ process_paragraph(
753
+ sp,
754
+ sp.abstract,
755
+ [(None, "Abstract")],
756
+ bib_dict,
757
+ ref_dict,
758
+ cleanup_bracket,
759
+ )
760
+ )
761
+ sp.abstract.decompose()
762
+ return abstract_text
763
+
764
+
765
+ def extract_body_text_from_div(
766
+ sp: bs4.BeautifulSoup,
767
+ div: bs4.element.Tag,
768
+ sections: List[Tuple],
769
+ bib_dict: Dict,
770
+ ref_dict: Dict,
771
+ cleanup_bracket: bool,
772
+ ) -> List[Dict]:
773
+ """
774
+ Parse body text from soup
775
+ :param sp:
776
+ :param div:
777
+ :param sections:
778
+ :param bib_dict:
779
+ :param ref_dict:
780
+ :param cleanup_bracket:
781
+ :return:
782
+ """
783
+ chunks = []
784
+ # check if nested divs; recursively process
785
+ if div.div:
786
+ for subdiv in div.find_all("div"):
787
+ # has header, add to section list and process
788
+ if subdiv.head:
789
+ chunks += extract_body_text_from_div(
790
+ sp,
791
+ subdiv,
792
+ sections + [(subdiv.head.get("n", None), subdiv.head.text.strip())],
793
+ bib_dict,
794
+ ref_dict,
795
+ cleanup_bracket,
796
+ )
797
+ subdiv.head.decompose()
798
+ # no header, process with same section list
799
+ else:
800
+ chunks += extract_body_text_from_div(
801
+ sp, subdiv, sections, bib_dict, ref_dict, cleanup_bracket
802
+ )
803
+ # process tags individually
804
+ for tag in div:
805
+ try:
806
+ if tag.name == "p":
807
+ if tag.text:
808
+ chunks.append(
809
+ process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)
810
+ )
811
+ elif tag.name == "formula":
812
+ # e.g. <formula xml:id="formula_0">Y = W T X.<label>(1)</label></formula>
813
+ label = tag.label.text
814
+ tag.label.decompose()
815
+ eq_text = tag.text
816
+ chunks.append(
817
+ {
818
+ "text": "EQUATION",
819
+ "cite_spans": [],
820
+ "ref_spans": [],
821
+ "eq_spans": [
822
+ {
823
+ "start": 0,
824
+ "end": 8,
825
+ "text": "EQUATION",
826
+ "ref_id": "EQREF",
827
+ "raw_str": eq_text,
828
+ "eq_num": label,
829
+ }
830
+ ],
831
+ "section": sections,
832
+ }
833
+ )
834
+ except AttributeError:
835
+ if tag.text:
836
+ chunks.append(
837
+ process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)
838
+ )
839
+
840
+ return chunks
841
+
842
+
843
+ def extract_body_text_from_tei_xml(
844
+ sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
845
+ ) -> List[Dict]:
846
+ """
847
+ Parse body text from soup
848
+ :param sp:
849
+ :param bib_dict:
850
+ :param ref_dict:
851
+ :param cleanup_bracket:
852
+ :return:
853
+ """
854
+ body_text = []
855
+ if sp.body:
856
+ body_text = extract_body_text_from_div(
857
+ sp, sp.body, [], bib_dict, ref_dict, cleanup_bracket
858
+ )
859
+ sp.body.decompose()
860
+ return body_text
861
+
862
+
863
+ def extract_back_matter_from_tei_xml(
864
+ sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
865
+ ) -> List[Dict]:
866
+ """
867
+ Parse back matter from soup
868
+ :param sp:
869
+ :param bib_dict:
870
+ :param ref_dict:
871
+ :param cleanup_bracket:
872
+ :return:
873
+ """
874
+ back_text = []
875
+
876
+ if sp.back:
877
+ for div in sp.back.find_all("div"):
878
+ if div.get("type"):
879
+ section_type = div.get("type")
880
+ else:
881
+ section_type = ""
882
+
883
+ for child_div in div.find_all("div"):
884
+ if child_div.head:
885
+ section_title = child_div.head.text.strip()
886
+ section_num = child_div.head.get("n", None)
887
+ child_div.head.decompose()
888
+ else:
889
+ section_title = section_type
890
+ section_num = None
891
+ if child_div.text:
892
+ if child_div.text:
893
+ back_text.append(
894
+ process_paragraph(
895
+ sp,
896
+ child_div,
897
+ [(section_num, section_title)],
898
+ bib_dict,
899
+ ref_dict,
900
+ cleanup_bracket,
901
+ )
902
+ )
903
+ sp.back.decompose()
904
+ return back_text
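Note: to close, a self-contained example of two helpers defined above, replace_refspans and normalize_grobid_id; the input text and spans are invented.

# Example input for the helpers above; the text and spans are invented.
text = "See [1] and Fig. 2 for details."
spans = [
    (4, 7, "[1]", "BIBREF0"),      # (start, end, span_text, new_substring)
    (12, 18, "Fig. 2", "FIGREF1"),
]
print(replace_refspans(spans, text))
# -> "See BIBREF0 and FIGREF1 for details."

print(normalize_grobid_id("#b0"))    # -> "BIBREF0"
print(normalize_grobid_id("fig_1"))  # -> "FIGREF1"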