Spaces:
Sleeping
Sleeping
feat: dctap-python files
Browse files- dctap/__init__.py +12 -0
- dctap/cli.py +97 -0
- dctap/config.py +156 -0
- dctap/csvreader.py +278 -0
- dctap/defaults.py +22 -0
- dctap/exceptions.py +13 -0
- dctap/inspect.py +57 -0
- dctap/loggers.py +43 -0
- dctap/tapclasses.py +298 -0
- dctap/utils.py +113 -0
dctap/__init__.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Convert a Tabular Application Profile from CSV into JSON."""

from dctap.tapclasses import TAPShape, TAPStatementTemplate
from dctap.csvreader import csvreader

# Package version; also hard-coded in cli.py's @click.version_option.
__version__ = "0.4.5"

# Keep version number in sync with:
# - https://github.com/dcmi/dctap-python/blob/main/docs/conf.py#L28
#   ../docs/conf.py
# - https://github.com/dcmi/dctap-python/blob/main/dctap/cli.py#L20
#   ../dctap/cli.py
|
dctap/cli.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""DC Tabular Application Profiles (DCTAP) command-line utility."""
|
2 |
+
|
3 |
+
import sys
|
4 |
+
import json as j
|
5 |
+
from ruamel.yaml import YAML
|
6 |
+
import click
|
7 |
+
from dctap.defaults import CONFIGFILE
|
8 |
+
from dctap.config import get_config, write_configfile
|
9 |
+
from dctap.csvreader import csvreader
|
10 |
+
from dctap.inspect import pprint_tapshapes, print_warnings
|
11 |
+
from dctap.loggers import stderr_logger
|
12 |
+
from dctap.utils import expand_uri_prefixes
|
13 |
+
|
14 |
+
# pylint: disable=unused-argument,no-value-for-parameter
|
15 |
+
# => unused-argument: Allows placeholders for now.
|
16 |
+
# => no-value-for-parameter: Okay in cli.py
|
17 |
+
|
18 |
+
|
19 |
+
# Entry point for the `dctap` command-line tool; subcommands attach via @cli.command().
@click.group()
@click.version_option("0.4.5", help="Show version and exit")
@click.help_option(help="Show help and exit")
@click.pass_context
def cli(context):
    """DC Tabular Application Profiles parser and base module

    Examples (see https://dctap-python.rtfd.io):

    \b
    Write starter config file:
    $ dctap init                      # Write dctap.yaml
    \b
    Show normalized view of TAP:
    $ dctap read x.csv                # Output as plain text
    $ dctap read --json x.csv         # Output as JSON
    $ dctap read --yaml x.csv         # Output as YAML
    $ dctap read --warnings x.csv     # Also show warnings
    """
    # Group body intentionally empty: subcommands do all the work.
|
38 |
+
|
39 |
+
|
40 |
+
@cli.command()
@click.help_option(help="Show help and exit")
@click.pass_context
def init(context):
    """Write config file: 'dctap.yaml'."""
    # Fixes two defects in the original:
    # - the 'hidden' parameter had no matching @click.option, so click could
    #   not invoke the command (TypeError at call time);
    # - write_configfile(configfile) passed the filename as the FIRST
    #   positional parameter (nondefault_configyaml_str), which would write
    #   the literal string "dctap.yaml" as the file's contents. The default
    #   value of default_configfile_name is already CONFIGFILE, so call with
    #   no arguments.
    write_configfile()
|
47 |
+
|
48 |
+
|
49 |
+
# Subcommand: parse a TAP CSV and print a normalized view.
@cli.command()
@click.argument("csvfile_obj", type=click.File(mode="r", encoding="utf-8-sig"))
@click.option("--config", type=click.Path(exists=True), help="Alternative config file")
@click.option("--expand-prefixes", is_flag=True, help="Expand compact IRIs")
@click.option("--warnings", is_flag=True, help="Print warnings to stderr")
@click.option("--json", is_flag=True, help="Print JSON to stdout")
@click.option("--yaml", is_flag=True, help="Print YAML to stdout")
@click.help_option(help="Show help and exit")
@click.pass_context
def read(context, csvfile_obj, config, expand_prefixes, warnings, json, yaml):
    """Show TAP as TXT, JSON, or YAML."""
    # pylint: disable=too-many-locals,too-many-arguments

    # Load settings from the user-supplied config file, else from defaults.
    if config:
        config_dict = get_config(nondefault_configfile_name=config)
    else:
        config_dict = get_config()
    tapshapes_dict = csvreader(open_csvfile_obj=csvfile_obj, config_dict=config_dict)

    # Optionally replace compact IRIs (prefix:local) with full URIs.
    if expand_prefixes:
        tapshapes_dict = expand_uri_prefixes(tapshapes_dict, config_dict)

    if json and yaml:
        # Quick fix for mutually exclusive options, a better fix in future.
        echo = stderr_logger()
        echo.warning("Please use either --json or --yaml")
        click.Context.exit(0)

    if json:
        # Without --warnings, the warnings key is dropped from the output.
        if not warnings:
            del tapshapes_dict["warnings"]
        json_output = j.dumps(tapshapes_dict, indent=2)
        print(json_output)

    if yaml:
        if not warnings:
            del tapshapes_dict["warnings"]
        y = YAML()
        y.indent(mapping=2, sequence=4, offset=2)
        y.dump(tapshapes_dict, sys.stdout)

    # Default output: plain-text pretty-print, warnings to stderr on request.
    if not (json or yaml):
        pprint_output = pprint_tapshapes(
            tapshapes_dict=tapshapes_dict, config_dict=config_dict
        )
        for line in pprint_output:
            print(line, file=sys.stdout)
        if warnings:
            print_warnings(tapshapes_dict["warnings"])
|
dctap/config.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Default settings."""
|
2 |
+
|
3 |
+
import sys
|
4 |
+
from dataclasses import asdict
|
5 |
+
from pathlib import Path
|
6 |
+
from dctap.defaults import CONFIGFILE, CONFIGYAML
|
7 |
+
from dctap.exceptions import ConfigError
|
8 |
+
from dctap.tapclasses import TAPShape, TAPStatementTemplate
|
9 |
+
from dctap.utils import load_yaml_to_dict, coerce_concise
|
10 |
+
|
11 |
+
|
12 |
+
def get_config(
    nondefault_configyaml_str=None,
    nondefault_configfile_name=None,
    default_configyaml_str=CONFIGYAML,
    default_configfile_name=CONFIGFILE,
    default_shape_class=TAPShape,
    default_state_class=TAPStatementTemplate,
):
    """Get configuration dictionary from package defaults (or from non-defaults).

    Args:
        nondefault_configyaml_str: config YAML passed directly as a string
            (mutually exclusive with nondefault_configfile_name).
        nondefault_configfile_name: path of an alternative config file.
        default_configyaml_str: built-in YAML used when no file is found.
        default_configfile_name: default config file path ('dctap.yaml').
        default_shape_class: dataclass whose fields define shape elements.
        default_state_class: dataclass whose fields define statement elements.

    Returns:
        Dict of settings: element lists, aliases, prefixes, etc.

    Raises:
        ConfigError: if both a YAML string and a file name are given, or if
            the named config file does not exist.
    """
    # pylint: disable=too-many-arguments
    yamlstring = None
    configdict_from_yamlstring = None
    # Start from computed defaults; YAML settings (if any) overlay these below.
    config_dict = _initialize_config_dict(default_shape_class, default_state_class)

    if nondefault_configfile_name and nondefault_configyaml_str:
        raise ConfigError("Can load YAML from either string or file, not both.")

    # Case 1: explicit config file name.
    if nondefault_configfile_name:
        nondefault_configfile = Path(nondefault_configfile_name)
        try:
            yamlstring = nondefault_configfile.read_text(encoding="utf-8")
        except FileNotFoundError as err:
            message = f"Config file '{nondefault_configfile_name}' not found."
            raise ConfigError(message) from err
        configdict_from_yamlstring = load_yaml_to_dict(yamlstring=yamlstring)
        # NOTE(review): if the file parses to None (empty YAML) we fall
        # through to the defaults path below — confirm this is intended.
        if configdict_from_yamlstring is not None:
            config_dict.update(configdict_from_yamlstring)
            config_dict = _add_extra_element_aliases(config_dict)
            config_dict = _add_colons_to_prefixes_if_needed(config_dict)
            return config_dict

    # Case 2: config YAML passed directly as a string.
    if nondefault_configyaml_str:
        configdict_from_yamlstring = load_yaml_to_dict(
            yamlstring=nondefault_configyaml_str
        )
        if configdict_from_yamlstring is not None:
            if not configdict_from_yamlstring.get("default_shape_identifier"):
                configdict_from_yamlstring["default_shape_identifier"] = "default"
            config_dict.update(configdict_from_yamlstring)
            config_dict = _add_extra_element_aliases(config_dict)
            config_dict = _add_colons_to_prefixes_if_needed(config_dict)
            return config_dict

    # Case 3: default config file on disk, else the built-in default YAML.
    try:
        yamlstring = Path(default_configfile_name).read_text(encoding="utf-8")
    except FileNotFoundError:
        yamlstring = default_configyaml_str
    configdict_from_yamlstring = load_yaml_to_dict(yamlstring=yamlstring)
    if configdict_from_yamlstring is not None:
        config_dict.update(configdict_from_yamlstring)
        config_dict = _add_extra_element_aliases(config_dict)
        config_dict = _add_colons_to_prefixes_if_needed(config_dict)
    return config_dict
|
65 |
+
|
66 |
+
|
67 |
+
def _add_extra_element_aliases(config_dict):
|
68 |
+
"""If extra element aliases are specified, add them to the configuration dict."""
|
69 |
+
extras = config_dict.get("extra_element_aliases")
|
70 |
+
if extras:
|
71 |
+
try:
|
72 |
+
extras = {coerce_concise(str(k).lower()): v for (k, v) in extras.items()}
|
73 |
+
except AttributeError:
|
74 |
+
extras = {}
|
75 |
+
config_dict["element_aliases"].update(extras)
|
76 |
+
return config_dict
|
77 |
+
|
78 |
+
|
79 |
+
def _add_colons_to_prefixes_if_needed(config_dict):
|
80 |
+
"""Reconstitute config_dict.prefixes, ensuring that each prefix ends in colon."""
|
81 |
+
prefixes = config_dict.get("prefixes")
|
82 |
+
new_prefixes = {}
|
83 |
+
if prefixes:
|
84 |
+
for prefix in prefixes:
|
85 |
+
if not prefix.endswith(":"):
|
86 |
+
new_prefixes[prefix + ":"] = prefixes[prefix]
|
87 |
+
else:
|
88 |
+
new_prefixes[prefix] = prefixes[prefix]
|
89 |
+
config_dict["prefixes"] = new_prefixes
|
90 |
+
return config_dict
|
91 |
+
|
92 |
+
|
93 |
+
def _get_shems(shape):
|
94 |
+
"""List TAP elements supported by given shape class."""
|
95 |
+
main_shems = list(asdict(shape()))
|
96 |
+
main_shems.remove("state_list")
|
97 |
+
main_shems.remove("shape_warns")
|
98 |
+
main_shems.remove("shape_extras")
|
99 |
+
return main_shems
|
100 |
+
|
101 |
+
|
102 |
+
def _get_stems(state):
|
103 |
+
"""List TAP elements supported by given statement template class."""
|
104 |
+
main_stems = list(asdict(state()))
|
105 |
+
main_stems.remove("state_warns")
|
106 |
+
main_stems.remove("state_extras")
|
107 |
+
return main_stems
|
108 |
+
|
109 |
+
|
110 |
+
def _initialize_config_dict(shapeclass, stateclass):
    """Initialize config dict with element lists (computed) and placeholder keys."""
    shape_elements = _get_shems(shapeclass)
    statement_elements = _get_stems(stateclass)
    csv_elements = shape_elements + statement_elements
    # Placeholder keys first, then the computed element lists.
    config_dict = {
        "element_aliases": dict(_get_aliases_dict(csv_elements)),
        "default_shape_identifier": "default",
        "prefixes": {},
        "extra_shape_elements": [],
        "extra_statement_template_elements": [],
        "picklist_elements": [],
        "picklist_item_separator": " ",
        "extra_value_node_types": [],
        "extra_element_aliases": {},
        "shape_elements": shape_elements,
        "statement_template_elements": statement_elements,
        "csv_elements": csv_elements,
    }
    return config_dict
|
129 |
+
|
130 |
+
|
131 |
+
def write_configfile(
    nondefault_configyaml_str=None,
    default_configfile_name=CONFIGFILE,
    default_configyaml_str=CONFIGYAML,
):
    """Write initial config file or exit trying."""
    if Path(default_configfile_name).exists():
        raise ConfigError(f"'{default_configfile_name}' exists - will not overwrite.")
    # A caller-supplied YAML string takes precedence (useful for testing).
    yaml_to_write = nondefault_configyaml_str or default_configyaml_str
    try:
        Path(default_configfile_name).write_text(yaml_to_write, encoding="utf-8")
        print(f"Settings written to '{default_configfile_name}'.", file=sys.stderr)
    except FileNotFoundError as error:
        raise ConfigError(f"'{default_configfile_name}' not writeable.") from error
|
147 |
+
|
148 |
+
|
149 |
+
def _get_aliases_dict(csv_elements_list=None):
|
150 |
+
"""Short/lowerkey-to-element dict from CSV elements, minus privates."""
|
151 |
+
aliases_to_elements = {}
|
152 |
+
for csv_elem in csv_elements_list:
|
153 |
+
if csv_elem[0] != "_":
|
154 |
+
lowerkey = csv_elem.lower()
|
155 |
+
aliases_to_elements[lowerkey] = csv_elem # { foobar: fooBar }
|
156 |
+
return aliases_to_elements
|
dctap/csvreader.py
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Parse TAP, return two-item tuple: (list of shape objects, list of warnings)."""
|
2 |
+
|
3 |
+
import re
|
4 |
+
from collections import defaultdict
|
5 |
+
from csv import DictReader
|
6 |
+
from io import StringIO as StringBuffer
|
7 |
+
from dataclasses import asdict
|
8 |
+
from dctap.exceptions import DctapError, NoDataError
|
9 |
+
from dctap.tapclasses import TAPShape, TAPStatementTemplate
|
10 |
+
from dctap.utils import coerce_concise
|
11 |
+
|
12 |
+
|
13 |
+
def csvreader(
    csvfile_str=None,
    config_dict=None,
    open_csvfile_obj=None,
    shape_class=TAPShape,
    state_class=TAPStatementTemplate,
):
    """From open CSV file object, return shapes dict.

    Args:
        csvfile_str: TAP as one CSV-formatted string (alternative to file).
        config_dict: settings dict, as produced by dctap.config.get_config().
        open_csvfile_obj: open file object for the CSV file.
        shape_class: dataclass for shapes (default TAPShape).
        state_class: dataclass for statement templates
            (default TAPStatementTemplate).

    Returns:
        Dict with keys 'shapes', 'namespaces', and 'warnings'.

    Raises:
        DctapError: if neither a CSV string nor a file object is provided.
    """
    # Parse CSV rows from whichever input form was given.
    if csvfile_str:
        (csvrows, csvwarns) = _get_rows(
            csvfile_str=csvfile_str,
            config_dict=config_dict,
        )
    elif open_csvfile_obj:
        (csvrows, csvwarns) = _get_rows(
            open_csvfile_obj=open_csvfile_obj,
            config_dict=config_dict,
        )
    else:
        raise DctapError("No data provided.")

    (tapshapes, tapwarns) = _get_tapshapes(
        rows=csvrows,
        config_dict=config_dict,
        shape_class=shape_class,
        state_class=state_class,
    )
    # Merge CSV-level (header) warnings with shape-level warnings.
    tapwarns = {**csvwarns, **tapwarns}
    # Record only namespaces for prefixes that rows actually used.
    prefixes_used = _get_prefixes_actually_used(csvrows)
    tapshapes = _add_namespaces(tapshapes, config_dict, prefixes_used)
    tapshapes = _add_tapwarns(tapshapes, tapwarns)
    return tapshapes
|
45 |
+
|
46 |
+
|
47 |
+
def _add_namespaces(tapshapes=None, config_dict=None, prefixes_used=None):
|
48 |
+
"""Adds key 'namespaces' to tapshapes dict."""
|
49 |
+
tapshapes["namespaces"] = {}
|
50 |
+
if config_dict.get("prefixes"):
|
51 |
+
for prefix in prefixes_used:
|
52 |
+
if config_dict["prefixes"].get(prefix):
|
53 |
+
tapshapes["namespaces"][prefix] = config_dict["prefixes"].get(prefix)
|
54 |
+
return tapshapes
|
55 |
+
|
56 |
+
|
57 |
+
def _add_tapwarns(tapshapes=None, tapwarns=None):
|
58 |
+
"""Adds key 'warnings' to tapshapes dict."""
|
59 |
+
tapshapes["warnings"] = tapwarns
|
60 |
+
return tapshapes
|
61 |
+
|
62 |
+
|
63 |
+
def _get_prefixes_actually_used(csvrows):
|
64 |
+
"""List strings before colons in values of elements that could take URI prefixes."""
|
65 |
+
prefixes = set()
|
66 |
+
for row in csvrows:
|
67 |
+
for element in [
|
68 |
+
"shapeID",
|
69 |
+
"propertyID",
|
70 |
+
"valueDataType",
|
71 |
+
"valueShape",
|
72 |
+
]:
|
73 |
+
if row.get(element):
|
74 |
+
prefix_plus_uri_pair = re.match(r"([^:]*):", row.get(element))
|
75 |
+
if prefix_plus_uri_pair: # if there is at least one
|
76 |
+
prefix_as_provided = prefix_plus_uri_pair.group(0)
|
77 |
+
prefixes.add(prefix_as_provided)
|
78 |
+
if row.get("valueConstraint"):
|
79 |
+
pattern = r"\b\w+:"
|
80 |
+
used_in_valueconstraint = re.findall(pattern, row.get("valueConstraint"))
|
81 |
+
prefixes = set(list(prefixes) + list(used_in_valueconstraint))
|
82 |
+
return list(prefixes)
|
83 |
+
|
84 |
+
|
85 |
+
def _get_rows(
    csvfile_str=None,
    config_dict=None,
    open_csvfile_obj=None,
):
    """Extract from _io.TextIOWrapper object a list of CSV file rows as dicts.

    Args:
        csvfile_str: TAP as one CSV-formatted string (alternative to file).
        config_dict: settings dict, as produced by dctap.config.get_config().
        open_csvfile_obj: open file object for the CSV file.

    Returns:
        Tuple (csv_rows, csv_warns): row dicts plus header warnings.

    Raises:
        NoDataError: if no data, or fewer than two non-comment lines.
        DctapError: if the normalized header lacks a 'propertyID' column.
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches
    if csvfile_str:
        csvfile_contents_str = csvfile_str
    elif open_csvfile_obj:
        csvfile_contents_str = open_csvfile_obj.read()
    else:
        raise NoDataError("No data to process.")

    # Strip lines and drop comment lines (starting with '#'); a usable TAP
    # needs at least a header line plus one data line.
    tmp_buffer = StringBuffer(csvfile_contents_str)
    csvlines_stripped = [line.strip() for line in tmp_buffer]
    csvlines_stripped = [
        line for line in csvlines_stripped if not re.match("#", line.strip())
    ]
    if len(csvlines_stripped) < 2:
        raise NoDataError("No data to process.")
    raw_header_line_list = csvlines_stripped[0].split(",")
    new_header_line_list = []

    # Recognized headers: built-in CSV elements plus configured extras
    # (extras are also registered as their own aliases).
    recognized_elements = config_dict.get("csv_elements")
    xtra_shems = config_dict.get("extra_shape_elements")
    xtra_stems = config_dict.get("extra_statement_template_elements")
    if xtra_shems:
        recognized_elements.extend(xtra_shems)
        for element in xtra_shems:
            config_dict["element_aliases"][element.lower()] = element
    if xtra_stems:
        recognized_elements.extend(xtra_stems)
        for element in xtra_stems:
            config_dict["element_aliases"][element.lower()] = element
    recognized_elements = [elem.lower() for elem in recognized_elements]

    # Normalize each header: coerce to concise form, then map via aliases.
    for column in raw_header_line_list:
        column = coerce_concise(column)
        column = _normalize_element_name(column, config_dict.get("element_aliases"))
        new_header_line_list.append(column)

    csv_warns = defaultdict(dict)
    for column in new_header_line_list:
        if column.lower() not in recognized_elements:
            warn = f"Non-DCTAP element '{column}' not configured as extra element."
            # Bug fix: the original re-created csv_warns["csv"]["column"] on
            # every unrecognized header, so only the LAST warning survived.
            # Accumulate all warnings instead.
            csv_warns["csv"].setdefault("column", []).append(warn)

    new_header_line_str = ",".join(new_header_line_list)
    csvlines_stripped[0] = new_header_line_str
    if not csvlines_stripped[0]:
        raise NoDataError("No data to process.")
    if "propertyID" not in csvlines_stripped[0]:
        raise DctapError("Valid DCTAP CSV must have a 'propertyID' column.")

    # Re-assemble the normalized CSV, parse rows, and strip cell whitespace.
    tmp_buffer2 = StringBuffer("".join([line + "\n" for line in csvlines_stripped]))
    csv_rows = list(DictReader(tmp_buffer2))
    for row in csv_rows:
        for key, value in row.items():
            if isinstance(value, str):  # ignore if instance of NoneType or list
                row[key] = value.strip()
    csv_warns = dict(csv_warns)
    return (csv_rows, csv_warns)
|
151 |
+
|
152 |
+
|
153 |
+
def _get_tapshapes(rows=None, config_dict=None, shape_class=None, state_class=None):
    """Return tuple: (shapes dict, warnings dict).

    Args:
        rows: list of row dicts as produced by _get_rows().
        config_dict: settings dict, as produced by dctap.config.get_config().
        shape_class: dataclass instantiated once per distinct shapeID.
        state_class: dataclass instantiated once per statement template row.
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements

    default_shape_id = config_dict["default_shape_identifier"]
    main_stems = config_dict.get("statement_template_elements")
    xtra_stems = config_dict.get("extra_statement_template_elements")
    shapes = {}  # dict for shapeID-to-TAPShape_list
    warns = defaultdict(dict)  # dict for shapeID-to-warnings_list

    for row in rows:
        # Resolve which shape this row belongs to: an explicit shapeID,
        # else the most recently seen shape, else the configured default.
        shape_id = ""
        if row.get("propertyID"):
            if row.get("shapeID"):
                shape_id = row.get("shapeID")
            elif not row.get("shapeID"):
                try:
                    shape_id = list(shapes)[-1]
                except IndexError:
                    # First row of the TAP carried no shapeID: use default.
                    shape_id = row["shapeID"] = default_shape_id
        elif row.get("shapeID"):
            shape_id = row.get("shapeID")

        if shape_id:
            if shape_id not in list(shapes):
                # First row for this shape: build and register the shape object.
                shape_obj = _make_shape(
                    row_dict=row,
                    config_dict=config_dict,
                    shape_class=shape_class,
                )
                shape_obj.normalize(config_dict)
                shapes[shape_id] = shape_obj
                warns[shape_id] = {}

                # Collect per-element warnings emitted during normalization.
                shape_warnings = shape_obj.get_warnings()
                for (elem, warn) in shape_warnings.items():
                    try:
                        warns[shape_id][elem].append(warn)
                    except KeyError:
                        warns[shape_id][elem] = []
                        warns[shape_id][elem].append(warn)

        # A row without a propertyID declares a shape only — no statement
        # template to build.
        if not row.get("propertyID"):
            continue

        # Populate a statement template from the row's statement columns;
        # unrecognized-but-configured extras go into state_extras.
        state_class_obj = state_class()
        for col in row:
            if col in main_stems:
                setattr(state_class_obj, col, row[col])
            elif col in xtra_stems:
                state_class_obj.state_extras[col] = row[col]

        state_class_obj.normalize(config_dict)
        shapes[shape_id].state_list.append(state_class_obj)

    warns_dict = dict(warns)
    shapes_dict = {}
    shapes_dict["shapes"] = []

    # Convert dataclass instances to plain dicts for JSON/YAML serialization,
    # renaming the internal 'state_list' to the public 'statement_templates'.
    for shape_obj in list(shapes.values()):
        sh_dict = asdict(shape_obj)
        sh_dict["statement_templates"] = sh_dict.pop("state_list")
        shapes_dict["shapes"].append(sh_dict)

    shapes_dict = _simplify(shapes_dict)

    return (shapes_dict, warns_dict)
|
222 |
+
|
223 |
+
|
224 |
+
def _make_shape(row_dict=None, config_dict=None, shape_class=None):
|
225 |
+
"""Populates shape fields of dataclass shape object from dict for one row.
|
226 |
+
|
227 |
+
Args:
|
228 |
+
row_dict: Dictionary of all columns headers (keys) and cell values (values)
|
229 |
+
found in a given row, with no distinction between shape elements and
|
230 |
+
statement template elements.
|
231 |
+
config_dict: Dictionary of settings, built-in or as read from config file.
|
232 |
+
|
233 |
+
Returns:
|
234 |
+
Unpopulated instance of shape class, for example:
|
235 |
+
TAPShape(shapeID='', state_list=[], shape_warns={}, state_extras={}, ...)
|
236 |
+
"""
|
237 |
+
main_shems = config_dict.get("shape_elements")
|
238 |
+
xtra_shems = config_dict.get("extra_shape_elements")
|
239 |
+
tapshape_obj = shape_class()
|
240 |
+
for key in row_dict:
|
241 |
+
if key in main_shems:
|
242 |
+
setattr(tapshape_obj, key, row_dict[key])
|
243 |
+
elif key in xtra_shems:
|
244 |
+
tapshape_obj.shape_extras[key] = row_dict[key]
|
245 |
+
return tapshape_obj
|
246 |
+
|
247 |
+
|
248 |
+
def _normalize_element_name(some_str, element_aliases_dict=None):
    """Given header string, return converted if aliased, else return unchanged.

    Args:
        some_str: raw CSV header string.
        element_aliases_dict: mapping of lowercase aliases to element names.

    Returns:
        The canonical element name if aliased, else the coerced input string.
    """
    some_str = coerce_concise(some_str)
    if element_aliases_dict:
        # Idiom/perf fix: the original linearly scanned every key and
        # compared for equality; a direct dict lookup is equivalent.
        return element_aliases_dict.get(some_str, some_str)
    return some_str
|
256 |
+
|
257 |
+
|
258 |
+
def _simplify(shapes_dict):
|
259 |
+
"""Remove elements from shapes dictionary with falsy values."""
|
260 |
+
for shape in shapes_dict["shapes"]:
|
261 |
+
for state in shape["statement_templates"]:
|
262 |
+
if state.get("state_extras"):
|
263 |
+
for (k, v) in state["state_extras"].items():
|
264 |
+
state[k] = v
|
265 |
+
del state["state_extras"]
|
266 |
+
if state.get("state_warns"):
|
267 |
+
del state["state_warns"]
|
268 |
+
for empty_element in [key for key in state if not state[key]]:
|
269 |
+
del state[empty_element]
|
270 |
+
if shape.get("shape_extras"):
|
271 |
+
for (k, v) in shape["shape_extras"].items():
|
272 |
+
shape[k] = v
|
273 |
+
del shape["shape_extras"]
|
274 |
+
if shape.get("shape_warns"):
|
275 |
+
del shape["shape_warns"]
|
276 |
+
for empty_element in [key for key in shape if not shape[key]]:
|
277 |
+
del shape[empty_element]
|
278 |
+
return shapes_dict
|
dctap/defaults.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Default settings."""
|
2 |
+
|
3 |
+
CONFIGFILE = "dctap.yaml"
|
4 |
+
CONFIGYAML = """\
|
5 |
+
# dctap configuration file (in YAML format)
|
6 |
+
# See https://dctap-python.readthedocs.io/en/latest/config/ for more options
|
7 |
+
|
8 |
+
prefixes:
|
9 |
+
":": "http://example.org/"
|
10 |
+
"dc:": "http://purl.org/dc/elements/1.1/"
|
11 |
+
"dcterms:": "http://purl.org/dc/terms/"
|
12 |
+
"dct:": "http://purl.org/dc/terms/"
|
13 |
+
"foaf:": "http://xmlns.com/foaf/0.1/"
|
14 |
+
"owl:": "http://www.w3.org/2002/07/owl#"
|
15 |
+
"rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
16 |
+
"rdfs:": "http://www.w3.org/2000/01/rdf-schema#"
|
17 |
+
"schema:": "http://schema.org/"
|
18 |
+
"skos:": "http://www.w3.org/2004/02/skos/core#"
|
19 |
+
"skosxl:": "http://www.w3.org/2008/05/skos-xl#"
|
20 |
+
"wdt:": "http://www.wikidata.org/prop/direct/"
|
21 |
+
"xsd:": "http://www.w3.org/2001/XMLSchema#"
|
22 |
+
"""
|
dctap/exceptions.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Exception classes for dctap."""
|
2 |
+
|
3 |
+
|
4 |
+
class DctapError(SystemExit):
    """Exceptions related to Dctap generally.

    NOTE(review): subclassing SystemExit means an uncaught DctapError
    terminates the process without a traceback — presumably intended for
    CLI use; confirm before reusing this package as a library.
    """


class ConfigError(DctapError):
    """Exceptions related to configuration."""


class NoDataError(DctapError):
    """Exception raised if there is no (TAP) data to process."""
|
dctap/inspect.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Print CSV contents and warnings."""
|
2 |
+
|
3 |
+
import sys
|
4 |
+
from dctap.loggers import stderr_logger
|
5 |
+
|
6 |
+
|
7 |
+
def pprint_tapshapes(tapshapes_dict=None, config_dict=None):
    """Pretty-print TAPShape objects to output list, ready for printing to console.

    Args:
        tapshapes_dict: dict with a 'shapes' list, as built by csvreader().
        config_dict: settings dict, as produced by dctap.config.get_config().

    Returns:
        List of output lines (not yet printed).
    """
    # pylint: disable=too-many-branches
    main_shems = config_dict.get("shape_elements")
    xtra_shems = config_dict.get("extra_shape_elements")
    main_stems = config_dict.get("statement_template_elements")
    xtra_stems = config_dict.get("extra_statement_template_elements")
    pprint_output = []
    pprint_output.append("Tabular Application Profile (TAP)")
    for tapshape_dict in tapshapes_dict.get("shapes"):
        pprint_output.append("    Shape")
        # Shape elements: label padded to column 33; extras in [brackets].
        for key in main_shems:
            indent08 = 8 * " " + key + " "
            while len(indent08) < 33:
                indent08 += " "
            if tapshape_dict.get(key):
                pprint_output.append(indent08 + str(tapshape_dict.get(key)))
        for key in xtra_shems:
            indent08 = 8 * " " + "[" + key + "] "
            while len(indent08) < 33:
                indent08 += " "
            if tapshape_dict.get(key):
                pprint_output.append(indent08 + str(tapshape_dict.get(key)))

        # Statement templates nested one level deeper than the shape.
        for sc_dict in tapshape_dict.get("statement_templates"):
            pprint_output.append("        Statement Template")
            for key in main_stems:
                if sc_dict.get(key):
                    indent12 = 12 * " " + key + " "
                    while len(indent12) < 33:
                        indent12 += " "
                    pprint_output.append(indent12 + str(sc_dict.get(key)))
            for key in xtra_stems:
                indent08 = 12 * " " + "[" + key + "] "
                while len(indent08) < 33:
                    indent08 += " "
                if sc_dict.get(key):
                    pprint_output.append(indent08 + str(sc_dict.get(key)))

    return pprint_output
|
47 |
+
|
48 |
+
|
49 |
+
def print_warnings(warnings_dict):
    """Print warnings, one per line, via the stderr logger."""
    # pylint: disable=logging-fstring-interpolation
    print("", file=sys.stderr)
    logger = stderr_logger()
    for (shapeid, element_warnings) in warnings_dict.items():
        for (element, warning_list) in element_warnings.items():
            for warning in warning_list:
                logger.warning(f"[{shapeid}/{element}] {warning}")
|
dctap/loggers.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Loggers: warnings to stderr, warnings to file, verbose debug info to file."""
|
2 |
+
|
3 |
+
import logging
|
4 |
+
import sys
|
5 |
+
|
6 |
+
# Formatters shared by the logger factories below.
PLAIN_FORMATTER = logging.Formatter("%(levelname)s %(message)s")

TIMESTAMP_FORMATTER = logging.Formatter(
    "%(asctime)s %(levelname)s %(message)s", "%Y-%m-%d %H:%M"
)


def stderr_logger():
    """Initialize logger for printing messages, INFO and higher, to standard error."""
    # NOTE(review): each call adds another handler to the same named logger,
    # so repeated calls duplicate output lines — confirm whether callers
    # invoke this more than once per process.
    stderr_info_logger = logging.getLogger("stderr_logger")
    stderr_info_logger.setLevel(logging.INFO)
    stderr_info_logger_handler = logging.StreamHandler(sys.stderr)
    stderr_info_logger_handler.setLevel(logging.INFO)
    stderr_info_logger.addHandler(stderr_info_logger_handler)
    stderr_info_logger_handler.setFormatter(PLAIN_FORMATTER)
    return stderr_info_logger


def warning_logger(output_filename="warnings.log"):
    """Logger for printing time-stamped messages, WARNING and higher, to file."""
    warningfile_logger = logging.getLogger("warningfile_logger")
    warningfile_logger.setLevel(logging.INFO)
    warningfile_logger_handler = logging.FileHandler(output_filename, mode="w")
    warningfile_logger_handler.setLevel(logging.WARNING)
    warningfile_logger.addHandler(warningfile_logger_handler)
    warningfile_logger_handler.setFormatter(TIMESTAMP_FORMATTER)
    return warningfile_logger


def debug_logger(output_filename="debug.log"):
    """Logger for printing time-stamped messages, DEBUG and higher, to file."""
    debugfile_logger = logging.getLogger("debugfile_logger")
    debugfile_logger.setLevel(logging.DEBUG)
    debugfile_logger_handler = logging.FileHandler(output_filename, mode="w")
    debugfile_logger_handler.setLevel(logging.DEBUG)
    debugfile_logger.addHandler(debugfile_logger_handler)
    debugfile_logger_handler.setFormatter(TIMESTAMP_FORMATTER)
    # Bug fix: the original returned the function object `debug_logger`
    # instead of the configured Logger instance.
    return debugfile_logger
|
dctap/tapclasses.py
ADDED
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Classes for Python objects derived from CSV files."""
|
2 |
+
|
3 |
+
import re
|
4 |
+
from dataclasses import dataclass, field
|
5 |
+
from typing import List
|
6 |
+
from dctap.utils import coerce_integer, coerce_numeric, looks_like_uri_or_curie
|
7 |
+
|
8 |
+
|
9 |
+
@dataclass
class TAPStatementTemplate:
    """Instances hold TAP/CSV elements related to statement templates.

    All element values arrive as strings from the CSV reader; normalize()
    coerces, splits, and validates them in place, recording any problems
    in self.state_warns (keyed by element name).
    """

    # pylint: disable=too-many-instance-attributes # It's a dataclass, right?
    # pylint: disable=invalid-name # for elements not named in snake case.

    propertyID: str = ""
    propertyLabel: str = ""
    mandatory: str = ""
    repeatable: str = ""
    valueNodeType: str = ""
    valueDataType: str = ""
    valueConstraint: str = ""
    valueConstraintType: str = ""
    valueShape: str = ""
    note: str = ""
    # Warnings collected by normalize(), keyed by element name.
    state_warns: dict = field(default_factory=dict)
    # Extra (non-DCTAP) elements carried along with the template.
    state_extras: dict = field(default_factory=dict)

    def normalize(self, config_dict):
        """Normalizes specific fields.

        The order matters: parse steps (eg, iristem split) must run before
        the checks that inspect their results.

        Args:
            config_dict: Configuration dict (picklist separator, extra node
                types, etc.) as produced by dctap.config.get_config().

        Returns:
            self, with fields normalized and state_warns populated.
        """
        # pylint: disable=attribute-defined-outside-init
        self._normalize_booleans()
        self._valueConstraintType_pattern_warn_if_valueConstraint_not_valid_regex()
        self._valueConstraintType_pattern_warn_if_used_with_value_shape()
        self._valueConstraintType_iristem_parse()
        self._valueConstraintType_iristem_warn_if_list_items_not_IRIs()
        self._valueConstraintType_languageTag_parse(config_dict)
        self._valueConstraintType_minmaxlength_warn_if_not_nonnegative_integer()
        self._valueConstraintType_minmaxinclusive_parse()
        self._valueConstraintType_minmaxinclusive_warn_if_value_not_numeric()
        self._valueConstraintType_warn_if_used_without_valueConstraint()
        self._valueDataType_warn_if_used_with_valueNodeType_IRI()
        self._valueNodeType_warn_if_valueNodeType_literal_used_with_any_valueShape()
        self._valueConstraintType_picklist_parse(config_dict)
        self._valueNodeType_is_from_enumerated_list(config_dict)
        self._parse_elements_configured_as_picklist_elements(config_dict)
        return self

    def _warn_if_value_not_urilike(self):
        """Warns when values of given elements do not look like URIs.

        NOTE(review): not invoked by normalize() above; presumably called
        directly by callers elsewhere in the package — confirm before removing.
        """
        elements_that_may_take_uris = ["propertyID", "valueDataType", "valueShape"]
        for elem in elements_that_may_take_uris:
            value = getattr(self, elem)
            warning = f"Value '{value}' does not look like a URI."
            if value:
                if not looks_like_uri_or_curie(value):
                    self.state_warns[elem] = warning
        return self

    def _normalize_booleans(self):
        """Coerces supported Boolean values to 'true' or 'false' or leaves unchanged."""
        valid_values_for_true = ["true", "TRUE", "True", "1"]
        valid_values_for_false = ["false", "FALSE", "False", "0"]
        boolean_elements = ["mandatory", "repeatable"]
        for elem in boolean_elements:
            value = getattr(self, elem)
            if value:
                warning_message = f"'{value}' is not a supported Boolean value."
                if value in valid_values_for_true:
                    setattr(self, elem, "true")
                elif value in valid_values_for_false:
                    setattr(self, elem, "false")
                else:
                    # Unrecognized value is left as-is but flagged.
                    self.state_warns[elem] = warning_message
        return self

    def _valueConstraintType_iristem_parse(self):
        """If valueConstraintType is IRIStem, split valueConstraint on whitespace."""
        self.valueConstraintType = self.valueConstraintType.lower()
        if self.valueConstraintType == "iristem":
            if self.valueConstraint:
                # After this, valueConstraint is a list of strings.
                self.valueConstraint = self.valueConstraint.split()
        return self

    def _valueConstraintType_iristem_warn_if_list_items_not_IRIs(self):
        """If IRIStem, warn if valueConstraint list items do not look like IRIs.

        Assumes _valueConstraintType_iristem_parse() already ran, so that
        valueConstraint is a list (iterating a raw string would check
        individual characters).
        """
        self.valueConstraintType = self.valueConstraintType.lower()
        if self.valueConstraintType == "iristem":
            for list_item in self.valueConstraint:
                if not looks_like_uri_or_curie(list_item):
                    self.state_warns["valueConstraint"] = (
                        f"Value constraint type is '{self.valueConstraintType}', "
                        f"but '{list_item}' does not look like an IRI or "
                        "Compact IRI."
                    )
        return self

    def _valueConstraintType_minmaxlength_warn_if_not_nonnegative_integer(self):
        """
        Tries to coerce valueConstraint to integer (or leaves string untouched).
        Warns if valueConstraint for minLength is not a nonnegative integer.

        NOTE(review): the coerce_integer() call runs unconditionally, so an
        integer-like valueConstraint is coerced even when the constraint type
        is not minLength/maxLength — confirm this side effect is intended.
        """
        vctype = self.valueConstraintType.lower()
        vc = self.valueConstraint = coerce_integer(self.valueConstraint)
        bad_vc_warning = (
            f"Value constraint type is '{self.valueConstraintType}', "
            f"but '{self.valueConstraint}' is not a positive integer."
        )
        if vctype in ("minlength", "maxlength"):
            if isinstance(vc, int):
                if vc < 0:
                    self.state_warns["valueConstraint"] = bad_vc_warning
            elif not isinstance(vc, int):
                self.state_warns["valueConstraint"] = bad_vc_warning

        return self

    def _valueConstraintType_minmaxinclusive_parse(self):
        """
        If value of valueConstraintType is 'minInclusive' or 'maxInclusive',
        value of valueConstraint should be numeric (int or float).
        """
        self.valueConstraintType = self.valueConstraintType.lower()
        value_constraint = self.valueConstraint
        if self.valueConstraintType in ("mininclusive", "maxinclusive"):
            if value_constraint:
                self.valueConstraint = coerce_numeric(value_constraint)
        return self

    def _valueConstraintType_minmaxinclusive_warn_if_value_not_numeric(self):
        """Warns if valueConstraint for minInclusive not coercable to float."""
        self.valueConstraintType = self.valueConstraintType.lower()
        if self.valueConstraintType in ("mininclusive", "maxinclusive"):
            try:
                float(self.valueConstraint)
            except (ValueError, TypeError):
                self.state_warns["valueConstraint"] = (
                    f"Value constraint type is '{self.valueConstraintType}', "
                    f"but '{self.valueConstraint}' is not numeric."
                )
        return self

    def _valueConstraintType_pattern_warn_if_valueConstraint_not_valid_regex(self):
        """If valueConstraintType Pattern, warn if valueConstraint not valid regex."""
        self.valueConstraintType = self.valueConstraintType.lower()
        if self.valueConstraintType == "pattern":
            try:
                re.compile(self.valueConstraint)
            except (re.error, TypeError):
                self.state_warns["valueConstraint"] = (
                    f"Value constraint type is '{self.valueConstraintType}', but "
                    f"'{self.valueConstraint}' is not a valid regular expression."
                )
        return self

    def _valueConstraintType_pattern_warn_if_used_with_value_shape(self):
        """Regular expressions cannot conform to value shapes."""
        self.valueConstraintType = self.valueConstraintType.lower()
        if self.valueConstraintType == "pattern":
            if self.valueShape:
                self.state_warns["valueConstraintType"] = (
                    f"Values of constraint type '{self.valueConstraintType}' "
                    "cannot conform to a value shape."
                )
        # Consistency fix: every other normalizer in this class returns self.
        return self

    def _valueConstraintType_languageTag_parse(self, config_dict):
        """For valueConstraintType languageTag, splits valueConstraint on separator."""
        self.valueConstraintType = self.valueConstraintType.lower()
        # Separator is configurable; default is a single space.
        sep = config_dict.get("picklist_item_separator", " ")
        if self.valueConstraintType == "languagetag":
            if self.valueConstraint:
                self.valueConstraint = self.valueConstraint.split(sep)
                # Strip whitespace and drop empty items.
                self.valueConstraint = [x.strip() for x in self.valueConstraint if x]
        return self

    def _valueConstraintType_warn_if_used_without_valueConstraint(self):
        """Warns if valueConstraintType used without valueConstraint."""
        if self.valueConstraintType:
            if not self.valueConstraint:
                self.state_warns["valueConstraint"] = (
                    f"Value constraint type '{self.valueConstraintType}' "
                    "has no corresponding value constraint."
                )
        return self

    def _valueConstraintType_picklist_parse(self, config_dict):
        """If valueConstraintType is Picklist, split valueConstraint on separator."""
        self.valueConstraintType = self.valueConstraintType.lower()
        sep = config_dict.get("picklist_item_separator", " ")
        if self.valueConstraintType == "picklist":
            if self.valueConstraint:
                self.valueConstraint = self.valueConstraint.split(sep)
                self.valueConstraint = [x.strip() for x in self.valueConstraint if x]
        return self

    def _valueNodeType_is_from_enumerated_list(self, config_dict):
        """Take valueNodeType from configurable enumerated list, case-insensitive."""
        warning = f"'{self.valueNodeType}' is not a valid node type."
        valid_types = ["iri", "bnode", "literal"]
        # This should be moved out to defaults dictionary.
        if config_dict.get("extra_value_node_types"):
            valid_types += [v.lower() for v in config_dict["extra_value_node_types"]]
        if self.valueNodeType:
            self.valueNodeType = self.valueNodeType.lower()
            if self.valueNodeType not in valid_types:
                self.state_warns["valueNodeType"] = warning
        return self

    def _valueNodeType_warn_if_valueNodeType_literal_used_with_any_valueShape(self):
        """Value with node type Literal cannot conform to a value shape.

        NOTE(review): the warning is recorded under key 'valueDataType'
        although it concerns valueNodeType — confirm whether intentional.
        """
        warning = "Values of node type 'literal' cannot conform to value shapes."
        self.valueNodeType = self.valueNodeType.lower()
        if self.valueShape:
            if self.valueNodeType == "literal":
                self.state_warns["valueDataType"] = warning
        return self

    def _valueDataType_warn_if_used_with_valueShape(self):
        """Value with any datatype cannot conform to a value shape.

        NOTE(review): not invoked by normalize() above; presumably called
        directly by callers elsewhere in the package — confirm before removing.
        """
        warning = "Values with datatypes (literals) cannot conform to value shapes."
        if self.valueShape:
            if self.valueDataType:
                self.state_warns["valueDataType"] = warning
        return self

    def _valueDataType_warn_if_used_with_valueNodeType_IRI(self):
        """Value with datatype implies Literal and cannot be node type IRI."""
        node_type = self.valueNodeType
        data_type = self.valueDataType
        # Warning text captures the pre-lowercased node type for display.
        warning = f"Datatype '{data_type}' incompatible with node type '{node_type}'."
        node_type = self.valueNodeType.lower()
        if node_type in ("iri", "uri", "bnode"):
            if self.valueDataType:
                self.state_warns["valueDataType"] = warning
        return self

    def _parse_elements_configured_as_picklist_elements(self, config_dict):
        """Parse elements configured as picklist elements.

        Each configured element's string value is split on the configured
        separator (items are not stripped here, unlike the picklist parse).
        """
        if config_dict.get("picklist_item_separator"):
            separator = config_dict.get("picklist_item_separator")
        else:
            separator = " "

        if config_dict.get("picklist_elements"):
            picklist_elements = config_dict.get("picklist_elements")
        else:
            picklist_elements = []

        for element in picklist_elements:
            if getattr(self, element):
                setattr(self, element, getattr(self, element).split(separator))

        return self

    def get_warnings(self):
        """Emit a copy of self.state_warns as populated by self.normalize()."""
        return dict(self.state_warns)
|
260 |
+
@dataclass
class TAPShape:
    """An instance holds TAP/CSV row elements related to one given, named shape."""

    # pylint: disable=invalid-name
    # True that propertyID, etc, do not conform to snake-case naming style.

    shapeID: str = ""
    shapeLabel: str = ""
    state_list: List[TAPStatementTemplate] = field(default_factory=list)
    shape_warns: dict = field(default_factory=dict)
    shape_extras: dict = field(default_factory=dict)

    def normalize(self, config_dict):
        """Normalize values where required."""
        self._normalize_default_shapeID(config_dict)
        self._warn_if_value_not_urilike()
        return self

    def _normalize_default_shapeID(self, config_dict):
        """If shapeID not specified, looks first in config, else sets "default"."""
        if not self.shapeID:
            fallback = config_dict.get("default_shape_identifier", "default")
            self.shapeID = fallback
        return self

    def _warn_if_value_not_urilike(self):
        """Warns when values of given elements do not look like URIs."""
        for element_name in ("shapeID",):
            candidate = getattr(self, element_name)
            if candidate and not looks_like_uri_or_curie(candidate):
                self.shape_warns[element_name] = (
                    f"Value '{candidate}' does not look like a URI."
                )
        return self

    def get_warnings(self):
        """Emit warnings dictionary self.shape_warns, populated by normalize() method."""
        return dict(self.shape_warns)
|
dctap/utils.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utilities."""
|
2 |
+
|
3 |
+
import sys
|
4 |
+
import re
|
5 |
+
from pathlib import Path
|
6 |
+
from urllib.parse import urlparse
|
7 |
+
from ruamel.yaml import YAML, YAMLError
|
8 |
+
from ruamel.yaml.scanner import ScannerError
|
9 |
+
from ruamel.yaml.parser import ParserError
|
10 |
+
from dctap.exceptions import ConfigError, DctapError
|
11 |
+
|
12 |
+
|
13 |
+
def load_yaml_to_dict(yamlstring=None, yamlfile=None):
    """Convert YAML from string or file (filename or Path object) into Python dict.

    Returns {} when nothing could be read, or None when the YAML is malformed
    (messages are printed to stderr in both error cases).
    """
    if yamlfile and yamlstring:
        raise DctapError("Can load YAML from string or file, but not both.")

    parsed = {}
    if yamlfile:
        try:
            yamlstring = Path(yamlfile).read_text(encoding="UTF-8")
        except FileNotFoundError:
            print(f"File '{yamlfile}' not found.", file=sys.stderr)

    if yamlstring is not None:
        loader = YAML(typ="safe", pure=True)
        try:
            parsed = loader.load(yamlstring)
        except (YAMLError, ScannerError, ParserError):
            parsed = None
            if yamlfile:
                print(f"YAML in '{yamlfile}' is badly formed.", file=sys.stderr)
            else:
                print("YAML is badly formed.", file=sys.stderr)

    return parsed
|
37 |
+
|
38 |
+
|
39 |
+
def coerce_concise(some_str=None):
    """
    For given string:
    - delete spaces, underscores, dashes, commas
    - lowercase
    - delete surrounding single and double quotes
    """
    # One C-level pass removes all four separator characters at once.
    stripped = some_str.translate(str.maketrans("", "", " _-,"))
    return stripped.lower().strip('"').strip("'")
|
54 |
+
|
55 |
+
|
56 |
+
def coerce_integer(value_constraint=None):
    """Coerces string to integer or returns string untouched."""
    try:
        return int(value_constraint)
    except (ValueError, TypeError):
        # Non-integer-like values (including None) pass through untouched.
        return value_constraint
|
63 |
+
|
64 |
+
|
65 |
+
def coerce_numeric(value_constraint=None):
    """Coerces string to numeric type or returns string untouched."""
    try:
        as_float = float(value_constraint)
    except (ValueError, TypeError):
        # Not numeric at all (or None): pass through untouched.
        return value_constraint
    if value_constraint == str(as_float):
        # Canonical float spelling (eg "1.5") becomes a float.
        return as_float
    try:
        return int(value_constraint)
    except (ValueError, TypeError):
        # eg ".5" or "1e3": not int-parseable, pass through untouched.
        return value_constraint
|
75 |
+
|
76 |
+
|
77 |
+
def expand_uri_prefixes(shapes_dict=None, config_dict=None):
    """Expand namespace prefixes, eg: dc:date to http://purl.org/dc/terms/date.

    Mutates shapes_dict in place (and also returns it): for every shape and
    statement template, any value that begins with a prefix declared in
    config_dict["prefixes"] is rewritten with the prefix's expansion.

    Raises:
        ConfigError: if config_dict has no (non-empty) "prefixes" section.

    NOTE(review): prefixes are used directly as regular-expression patterns
    in re.match()/re.sub(); a prefix containing regex metacharacters (eg ".")
    would match more than intended — confirm prefixes are always plain.
    """
    # pylint: disable=too-many-nested-blocks
    if not config_dict.get("prefixes"):
        raise ConfigError("No 'prefixes' section found in config file.")
    for shape in shapes_dict["shapes"]:
        # Expand the shape identifier itself.
        for prefix in config_dict["prefixes"]:
            if re.match(prefix, shape["shapeID"]):
                prefix_expanded = config_dict["prefixes"][prefix]
                shape["shapeID"] = re.sub(prefix, prefix_expanded, shape["shapeID"])
        # Then expand the URI-capable elements of each statement template.
        for sc in shape["statement_templates"]:
            for element in ["propertyID", "valueDataType", "valueShape"]:
                if sc.get(element):
                    for prefix in config_dict["prefixes"]:
                        if re.match(prefix, sc.get(element)):
                            prefix_expanded = config_dict["prefixes"][prefix]
                            sc[element] = re.sub(prefix, prefix_expanded, sc[element])
    return shapes_dict
|
95 |
+
|
96 |
+
|
97 |
+
def looks_like_uri_or_curie(url_string):
    """True if string superficially looks like a URI or Compact URI."""
    if not isinstance(url_string, str):
        return False

    parsed = urlparse(url_string)
    # Anything with a scheme counts: a URI scheme ("http") or a CURIE
    # prefix ("dcterms") are indistinguishable to urlparse, and both pass.
    if parsed.scheme:
        return True
    # A prefix-less CURIE such as ":name" parses as a path starting
    # with a colon.
    if parsed.path.startswith(":"):
        return True
    return False
|