rimpo committed on
Commit
28c59d9
1 Parent(s): 78f65de

feat: dctap-python files

Browse files
dctap/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Convert a Tabular Application Profile from CSV into JSON."""
2
+
3
+ from dctap.tapclasses import TAPShape, TAPStatementTemplate
4
+ from dctap.csvreader import csvreader
5
+
6
+ __version__ = "0.4.5"
7
+
8
+ # Keep version number in sync with:
9
+ # - https://github.com/dcmi/dctap-python/blob/main/docs/conf.py#L28
10
+ # ../docs/conf.py
11
+ # - https://github.com/dcmi/dctap-python/blob/main/dctap/cli.py#L20
12
+ # ../dctap/cli.py
dctap/cli.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DC Tabular Application Profiles (DCTAP) command-line utility."""
2
+
3
+ import sys
4
+ import json as j
5
+ from ruamel.yaml import YAML
6
+ import click
7
+ from dctap.defaults import CONFIGFILE
8
+ from dctap.config import get_config, write_configfile
9
+ from dctap.csvreader import csvreader
10
+ from dctap.inspect import pprint_tapshapes, print_warnings
11
+ from dctap.loggers import stderr_logger
12
+ from dctap.utils import expand_uri_prefixes
13
+
14
+ # pylint: disable=unused-argument,no-value-for-parameter
15
+ # => unused-argument: Allows placeholders for now.
16
+ # => no-value-for-parameter: Okay in cli.py
17
+
18
+
19
# Root command group for the 'dctap' command-line tool. The docstring below is
# what click shows as help text, and the version string passed to
# version_option is hard-coded here (dctap/__init__.py asks for it to be kept
# in sync with __version__).
@click.group()
@click.version_option("0.4.5", help="Show version and exit")
@click.help_option(help="Show help and exit")
@click.pass_context
def cli(context):
    """DC Tabular Application Profiles parser and base module

    Examples (see https://dctap-python.rtfd.io):

    \b
    Write starter config file:
    $ dctap init # Write dctap.yaml
    \b
    Show normalized view of TAP:
    $ dctap read x.csv # Output as plain text
    $ dctap read --json x.csv # Output as JSON
    $ dctap read --yaml x.csv # Output as YAML
    $ dctap read --warnings x.csv # Also show warnings
    """
38
+
39
+
40
@cli.command()
@click.help_option(help="Show help and exit")
@click.pass_context
def init(context, hidden=False):
    """Write config file: 'dctap.yaml'."""
    # 'hidden' is a placeholder with no matching click option, so click never
    # supplies a value for it; without a default the command would fail with
    # "missing 1 required positional argument" as soon as it is invoked.
    #
    # The filename must be passed by keyword: the first positional parameter of
    # write_configfile() is the YAML *content* (nondefault_configyaml_str), so a
    # positional call would write the filename itself into the file.
    write_configfile(default_configfile_name=CONFIGFILE)
47
+
48
+
49
@cli.command()
@click.argument("csvfile_obj", type=click.File(mode="r", encoding="utf-8-sig"))
@click.option("--config", type=click.Path(exists=True), help="Alternative config file")
@click.option("--expand-prefixes", is_flag=True, help="Expand compact IRIs")
@click.option("--warnings", is_flag=True, help="Print warnings to stderr")
@click.option("--json", is_flag=True, help="Print JSON to stdout")
@click.option("--yaml", is_flag=True, help="Print YAML to stdout")
@click.help_option(help="Show help and exit")
@click.pass_context
def read(context, csvfile_obj, config, expand_prefixes, warnings, json, yaml):
    """Show TAP as TXT, JSON, or YAML."""
    # pylint: disable=too-many-locals,too-many-arguments

    # Load settings from the file given with --config, else from the defaults.
    if config:
        config_dict = get_config(nondefault_configfile_name=config)
    else:
        config_dict = get_config()
    tapshapes_dict = csvreader(open_csvfile_obj=csvfile_obj, config_dict=config_dict)

    if expand_prefixes:
        tapshapes_dict = expand_uri_prefixes(tapshapes_dict, config_dict)

    if json and yaml:
        # Quick fix for mutually exclusive options, a better fix in future.
        echo = stderr_logger()
        echo.warning("Please use either --json or --yaml")
        # Exit through the active context instance. Calling the method on the
        # click.Context class would bind the integer 0 as 'self'.
        context.exit(0)

    if json:
        # Warnings are part of the serialized output only when requested.
        if not warnings:
            del tapshapes_dict["warnings"]
        json_output = j.dumps(tapshapes_dict, indent=2)
        print(json_output)

    if yaml:
        if not warnings:
            del tapshapes_dict["warnings"]
        y = YAML()
        y.indent(mapping=2, sequence=4, offset=2)
        y.dump(tapshapes_dict, sys.stdout)

    if not (json or yaml):
        # Plain-text view goes to stdout; warnings (if requested) to stderr.
        pprint_output = pprint_tapshapes(
            tapshapes_dict=tapshapes_dict, config_dict=config_dict
        )
        for line in pprint_output:
            print(line, file=sys.stdout)
        if warnings:
            print_warnings(tapshapes_dict["warnings"])
dctap/config.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Default settings."""
2
+
3
+ import sys
4
+ from dataclasses import asdict
5
+ from pathlib import Path
6
+ from dctap.defaults import CONFIGFILE, CONFIGYAML
7
+ from dctap.exceptions import ConfigError
8
+ from dctap.tapclasses import TAPShape, TAPStatementTemplate
9
+ from dctap.utils import load_yaml_to_dict, coerce_concise
10
+
11
+
12
def get_config(
    nondefault_configyaml_str=None,
    nondefault_configfile_name=None,
    default_configyaml_str=CONFIGYAML,
    default_configfile_name=CONFIGFILE,
    default_shape_class=TAPShape,
    default_state_class=TAPStatementTemplate,
):
    """Get configuration dictionary from package defaults (or from non-defaults).

    The YAML source is chosen in this order:
      1. the file named by nondefault_configfile_name, if given;
      2. the string nondefault_configyaml_str, if given;
      3. the file named by default_configfile_name, if it exists;
      4. the built-in default_configyaml_str.

    Args:
        nondefault_configyaml_str: YAML settings passed directly as a string.
        nondefault_configfile_name: Name of a config file that must exist.
        default_configyaml_str: Built-in YAML used when no config file is found.
        default_configfile_name: Name of the config file looked for by default.
        default_shape_class: Dataclass whose fields define the shape elements.
        default_state_class: Dataclass whose fields define the statement
            template elements.

    Returns:
        Config dict: computed element lists and placeholder keys, overlaid with
        the settings read from YAML.

    Raises:
        ConfigError: If both a YAML string and a file name are given, or if the
            named non-default config file does not exist.
    """
    # pylint: disable=too-many-arguments
    config_dict = _initialize_config_dict(default_shape_class, default_state_class)

    if nondefault_configfile_name and nondefault_configyaml_str:
        raise ConfigError("Can load YAML from either string or file, not both.")

    if nondefault_configfile_name:
        try:
            yamlstring = Path(nondefault_configfile_name).read_text(encoding="utf-8")
        except FileNotFoundError as err:
            message = f"Config file '{nondefault_configfile_name}' not found."
            raise ConfigError(message) from err
    elif nondefault_configyaml_str:
        yamlstring = nondefault_configyaml_str
    else:
        # A missing default config file is not an error: fall back to built-ins.
        try:
            yamlstring = Path(default_configfile_name).read_text(encoding="utf-8")
        except FileNotFoundError:
            yamlstring = default_configyaml_str

    settings_from_yaml = load_yaml_to_dict(yamlstring=yamlstring)
    return _overlay_yaml_settings(config_dict, settings_from_yaml)


def _overlay_yaml_settings(config_dict, settings_from_yaml):
    """Merge settings parsed from YAML into config_dict, then normalize it."""
    if settings_from_yaml is not None:
        # An empty 'default_shape_identifier' would override the initialized
        # value with a falsy one, so it is replaced with "default" regardless
        # of where the YAML came from.
        if not settings_from_yaml.get("default_shape_identifier"):
            settings_from_yaml["default_shape_identifier"] = "default"
        config_dict.update(settings_from_yaml)
    config_dict = _add_extra_element_aliases(config_dict)
    config_dict = _add_colons_to_prefixes_if_needed(config_dict)
    return config_dict
65
+
66
+
67
+ def _add_extra_element_aliases(config_dict):
68
+ """If extra element aliases are specified, add them to the configuration dict."""
69
+ extras = config_dict.get("extra_element_aliases")
70
+ if extras:
71
+ try:
72
+ extras = {coerce_concise(str(k).lower()): v for (k, v) in extras.items()}
73
+ except AttributeError:
74
+ extras = {}
75
+ config_dict["element_aliases"].update(extras)
76
+ return config_dict
77
+
78
+
79
+ def _add_colons_to_prefixes_if_needed(config_dict):
80
+ """Reconstitute config_dict.prefixes, ensuring that each prefix ends in colon."""
81
+ prefixes = config_dict.get("prefixes")
82
+ new_prefixes = {}
83
+ if prefixes:
84
+ for prefix in prefixes:
85
+ if not prefix.endswith(":"):
86
+ new_prefixes[prefix + ":"] = prefixes[prefix]
87
+ else:
88
+ new_prefixes[prefix] = prefixes[prefix]
89
+ config_dict["prefixes"] = new_prefixes
90
+ return config_dict
91
+
92
+
93
+ def _get_shems(shape):
94
+ """List TAP elements supported by given shape class."""
95
+ main_shems = list(asdict(shape()))
96
+ main_shems.remove("state_list")
97
+ main_shems.remove("shape_warns")
98
+ main_shems.remove("shape_extras")
99
+ return main_shems
100
+
101
+
102
+ def _get_stems(state):
103
+ """List TAP elements supported by given statement template class."""
104
+ main_stems = list(asdict(state()))
105
+ main_stems.remove("state_warns")
106
+ main_stems.remove("state_extras")
107
+ return main_stems
108
+
109
+
110
def _initialize_config_dict(shapeclass, stateclass):
    """Build the starting config dict: placeholder keys plus computed element lists.

    The element lists are derived from the fields of the given shape and
    statement template classes; 'element_aliases' starts out with a
    lowercase alias for each of those elements.
    """
    shape_elements = _get_shems(shapeclass)
    statement_elements = _get_stems(stateclass)
    all_elements = shape_elements + statement_elements

    initial_aliases = {}
    initial_aliases.update(_get_aliases_dict(all_elements))

    return {
        "element_aliases": initial_aliases,
        "default_shape_identifier": "default",
        "prefixes": {},
        "extra_shape_elements": [],
        "extra_statement_template_elements": [],
        "picklist_elements": [],
        "picklist_item_separator": " ",
        "extra_value_node_types": [],
        "extra_element_aliases": {},
        "shape_elements": shape_elements,
        "statement_template_elements": statement_elements,
        "csv_elements": all_elements,
    }
129
+
130
+
131
def write_configfile(
    nondefault_configyaml_str=None,
    default_configfile_name=CONFIGFILE,
    default_configyaml_str=CONFIGYAML,
):
    """Write a starter config file, refusing to overwrite an existing one.

    Args:
        nondefault_configyaml_str: Alternative file content (useful for testing).
        default_configfile_name: Name of the file to create.
        default_configyaml_str: Content written when no alternative is given.

    Raises:
        ConfigError: If the file already exists, or if opening it for writing
            raises FileNotFoundError.
    """
    if Path(default_configfile_name).exists():
        raise ConfigError(f"'{default_configfile_name}' exists - will not overwrite.")

    yaml_to_write = nondefault_configyaml_str or default_configyaml_str
    try:
        with open(default_configfile_name, "w", encoding="utf-8") as outfile:
            outfile.write(yaml_to_write)
        print(f"Settings written to '{default_configfile_name}'.", file=sys.stderr)
    except FileNotFoundError as error:
        raise ConfigError(f"'{default_configfile_name}' not writeable.") from error
147
+
148
+
149
+ def _get_aliases_dict(csv_elements_list=None):
150
+ """Short/lowerkey-to-element dict from CSV elements, minus privates."""
151
+ aliases_to_elements = {}
152
+ for csv_elem in csv_elements_list:
153
+ if csv_elem[0] != "_":
154
+ lowerkey = csv_elem.lower()
155
+ aliases_to_elements[lowerkey] = csv_elem # { foobar: fooBar }
156
+ return aliases_to_elements
dctap/csvreader.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Parse TAP, return two-item tuple: (list of shape objects, list of warnings)."""
2
+
3
+ import re
4
+ from collections import defaultdict
5
+ from csv import DictReader
6
+ from io import StringIO as StringBuffer
7
+ from dataclasses import asdict
8
+ from dctap.exceptions import DctapError, NoDataError
9
+ from dctap.tapclasses import TAPShape, TAPStatementTemplate
10
+ from dctap.utils import coerce_concise
11
+
12
+
13
def csvreader(
    csvfile_str=None,
    config_dict=None,
    open_csvfile_obj=None,
    shape_class=TAPShape,
    state_class=TAPStatementTemplate,
):
    """Parse CSV, given as a string or as an open file object, into a shapes dict.

    The string takes precedence when both are given. The returned dict is
    the shapes dict with 'namespaces' and 'warnings' keys added.

    Raises:
        DctapError: If neither a CSV string nor an open file object is given.
    """
    if csvfile_str:
        csv_source = {"csvfile_str": csvfile_str}
    elif open_csvfile_obj:
        csv_source = {"open_csvfile_obj": open_csvfile_obj}
    else:
        raise DctapError("No data provided.")

    csvrows, csvwarns = _get_rows(config_dict=config_dict, **csv_source)
    tapshapes, shape_warns = _get_tapshapes(
        rows=csvrows,
        config_dict=config_dict,
        shape_class=shape_class,
        state_class=state_class,
    )

    # On a key clash, warnings keyed by shape replace the CSV-level ones.
    all_warns = dict(csvwarns)
    all_warns.update(shape_warns)

    prefixes_used = _get_prefixes_actually_used(csvrows)
    tapshapes = _add_namespaces(tapshapes, config_dict, prefixes_used)
    return _add_tapwarns(tapshapes, all_warns)
45
+
46
+
47
+ def _add_namespaces(tapshapes=None, config_dict=None, prefixes_used=None):
48
+ """Adds key 'namespaces' to tapshapes dict."""
49
+ tapshapes["namespaces"] = {}
50
+ if config_dict.get("prefixes"):
51
+ for prefix in prefixes_used:
52
+ if config_dict["prefixes"].get(prefix):
53
+ tapshapes["namespaces"][prefix] = config_dict["prefixes"].get(prefix)
54
+ return tapshapes
55
+
56
+
57
+ def _add_tapwarns(tapshapes=None, tapwarns=None):
58
+ """Adds key 'warnings' to tapshapes dict."""
59
+ tapshapes["warnings"] = tapwarns
60
+ return tapshapes
61
+
62
+
63
+ def _get_prefixes_actually_used(csvrows):
64
+ """List strings before colons in values of elements that could take URI prefixes."""
65
+ prefixes = set()
66
+ for row in csvrows:
67
+ for element in [
68
+ "shapeID",
69
+ "propertyID",
70
+ "valueDataType",
71
+ "valueShape",
72
+ ]:
73
+ if row.get(element):
74
+ prefix_plus_uri_pair = re.match(r"([^:]*):", row.get(element))
75
+ if prefix_plus_uri_pair: # if there is at least one
76
+ prefix_as_provided = prefix_plus_uri_pair.group(0)
77
+ prefixes.add(prefix_as_provided)
78
+ if row.get("valueConstraint"):
79
+ pattern = r"\b\w+:"
80
+ used_in_valueconstraint = re.findall(pattern, row.get("valueConstraint"))
81
+ prefixes = set(list(prefixes) + list(used_in_valueconstraint))
82
+ return list(prefixes)
83
+
84
+
85
def _get_rows(
    csvfile_str=None,
    config_dict=None,
    open_csvfile_obj=None,
):
    """Extract a list of CSV rows (as dicts) from a string or an open file object.

    Lines are stripped, lines starting with '#' are dropped, and the header
    names are normalized through the configured element aliases before the
    text is handed to csv.DictReader.

    Args:
        csvfile_str: CSV content as a string (takes precedence if given).
        config_dict: Settings dict. Its 'csv_elements' and 'element_aliases'
            entries are extended in place with any configured extra elements.
        open_csvfile_obj: Open text file object to read the CSV from.

    Returns:
        Tuple (rows, warnings): rows is a list of dicts with stripped string
        values; warnings is a dict holding, under ['csv']['column'], one
        message per header that is not a recognized element.

    Raises:
        NoDataError: If no input is given, or fewer than two non-comment lines
            remain, or the header line is empty.
        DctapError: If the header line does not contain 'propertyID'.
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches
    if csvfile_str:
        csvfile_contents_str = csvfile_str
    elif open_csvfile_obj:
        csvfile_contents_str = open_csvfile_obj.read()
    else:
        raise NoDataError("No data to process.")

    csvlines_stripped = [line.strip() for line in StringBuffer(csvfile_contents_str)]
    csvlines_stripped = [line for line in csvlines_stripped if not line.startswith("#")]
    if len(csvlines_stripped) < 2:
        raise NoDataError("No data to process.")
    raw_header_line_list = csvlines_stripped[0].split(",")

    recognized_elements = config_dict.get("csv_elements")
    element_aliases = config_dict["element_aliases"]
    for extra_elements in (
        config_dict.get("extra_shape_elements"),
        config_dict.get("extra_statement_template_elements"),
    ):
        if not extra_elements:
            continue
        for element in extra_elements:
            # The list is shared with config_dict, so only add what is missing;
            # otherwise every call with the same config appends duplicates.
            if element not in recognized_elements:
                recognized_elements.append(element)
            element_aliases[element.lower()] = element
    recognized_lowercased = {elem.lower() for elem in recognized_elements}

    new_header_line_list = [
        _normalize_element_name(coerce_concise(column), config_dict.get("element_aliases"))
        for column in raw_header_line_list
    ]

    csv_warns = defaultdict(dict)
    for column in new_header_line_list:
        if column.lower() not in recognized_lowercased:
            warn = f"Non-DCTAP element '{column}' not configured as extra element."
            # Accumulate: re-creating the list here would keep only the
            # warning for the last unrecognized column.
            csv_warns["csv"].setdefault("column", []).append(warn)

    csvlines_stripped[0] = ",".join(new_header_line_list)
    if not csvlines_stripped[0]:
        raise NoDataError("No data to process.")
    if "propertyID" not in csvlines_stripped[0]:
        raise DctapError("Valid DCTAP CSV must have a 'propertyID' column.")

    tmp_buffer2 = StringBuffer("".join([line + "\n" for line in csvlines_stripped]))
    csv_rows = list(DictReader(tmp_buffer2))
    for row in csv_rows:
        for key, value in row.items():
            if isinstance(value, str):  # ignore if instance of NoneType or list
                row[key] = value.strip()
    return (csv_rows, dict(csv_warns))
151
+
152
+
153
def _get_tapshapes(rows=None, config_dict=None, shape_class=None, state_class=None):
    """Return tuple: (shapes dict, warnings dict).

    Args:
        rows: List of row dicts (column name to cell value).
        config_dict: Settings dict; supplies the default shape identifier and
            the lists of main and extra statement template elements.
        shape_class: Class instantiated (via _make_shape) once per shape.
        state_class: Class instantiated once per row that has a propertyID.

    Returns:
        Tuple of the simplified dict {"shapes": [...]} and a dict mapping each
        shape identifier to {element: [warnings]} for that shape.
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements

    default_shape_id = config_dict["default_shape_identifier"]
    main_stems = config_dict.get("statement_template_elements")
    xtra_stems = config_dict.get("extra_statement_template_elements")
    shapes = {}  # dict for shapeID-to-TAPShape_list
    warns = defaultdict(dict)  # dict for shapeID-to-warnings_list

    for row in rows:
        shape_id = row.get("shapeID") or ""
        if not shape_id and row.get("propertyID"):
            # A statement row without shapeID belongs to the most recently
            # started shape, or to the default shape if there is none yet.
            if shapes:
                shape_id = list(shapes)[-1]
            else:
                shape_id = row["shapeID"] = default_shape_id

        if shape_id and shape_id not in shapes:
            shape_obj = _make_shape(
                row_dict=row,
                config_dict=config_dict,
                shape_class=shape_class,
            )
            shape_obj.normalize(config_dict)
            shapes[shape_id] = shape_obj
            warns[shape_id] = {}
            # Record the shape's warnings once, when the shape is created.
            # Doing this for every row repeated each warning per row and,
            # when shape identifiers interleave, filed the warnings of the
            # most recently created shape under another shape's identifier.
            for (elem, warn) in shape_obj.get_warnings().items():
                warns[shape_id].setdefault(elem, []).append(warn)

        if not row.get("propertyID"):
            continue

        state_class_obj = state_class()
        for col in row:
            if col in main_stems:
                setattr(state_class_obj, col, row[col])
            elif col in xtra_stems:
                state_class_obj.state_extras[col] = row[col]

        state_class_obj.normalize(config_dict)
        shapes[shape_id].state_list.append(state_class_obj)

    warns_dict = dict(warns)
    shapes_dict = {"shapes": []}
    for shape_obj in shapes.values():
        sh_dict = asdict(shape_obj)
        sh_dict["statement_templates"] = sh_dict.pop("state_list")
        shapes_dict["shapes"].append(sh_dict)

    shapes_dict = _simplify(shapes_dict)

    return (shapes_dict, warns_dict)
222
+
223
+
224
+ def _make_shape(row_dict=None, config_dict=None, shape_class=None):
225
+ """Populates shape fields of dataclass shape object from dict for one row.
226
+
227
+ Args:
228
+ row_dict: Dictionary of all columns headers (keys) and cell values (values)
229
+ found in a given row, with no distinction between shape elements and
230
+ statement template elements.
231
+ config_dict: Dictionary of settings, built-in or as read from config file.
232
+
233
+ Returns:
234
+ Unpopulated instance of shape class, for example:
235
+ TAPShape(shapeID='', state_list=[], shape_warns={}, state_extras={}, ...)
236
+ """
237
+ main_shems = config_dict.get("shape_elements")
238
+ xtra_shems = config_dict.get("extra_shape_elements")
239
+ tapshape_obj = shape_class()
240
+ for key in row_dict:
241
+ if key in main_shems:
242
+ setattr(tapshape_obj, key, row_dict[key])
243
+ elif key in xtra_shems:
244
+ tapshape_obj.shape_extras[key] = row_dict[key]
245
+ return tapshape_obj
246
+
247
+
248
def _normalize_element_name(some_str, element_aliases_dict=None):
    """Return the element name that a header string is an alias for, if any.

    The header is first passed through coerce_concise. The aliases are then
    scanned in order and each matching alias replaces the current name, so
    a replacement can itself be matched by a later alias. Without a match
    (or without aliases) the coerced header is returned.
    """
    name = coerce_concise(some_str)
    if not element_aliases_dict:
        return name
    for alias, element in element_aliases_dict.items():
        if alias == name:
            name = element
    return name
256
+
257
+
258
+ def _simplify(shapes_dict):
259
+ """Remove elements from shapes dictionary with falsy values."""
260
+ for shape in shapes_dict["shapes"]:
261
+ for state in shape["statement_templates"]:
262
+ if state.get("state_extras"):
263
+ for (k, v) in state["state_extras"].items():
264
+ state[k] = v
265
+ del state["state_extras"]
266
+ if state.get("state_warns"):
267
+ del state["state_warns"]
268
+ for empty_element in [key for key in state if not state[key]]:
269
+ del state[empty_element]
270
+ if shape.get("shape_extras"):
271
+ for (k, v) in shape["shape_extras"].items():
272
+ shape[k] = v
273
+ del shape["shape_extras"]
274
+ if shape.get("shape_warns"):
275
+ del shape["shape_warns"]
276
+ for empty_element in [key for key in shape if not shape[key]]:
277
+ del shape[empty_element]
278
+ return shapes_dict
dctap/defaults.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Default settings."""
2
+
3
+ CONFIGFILE = "dctap.yaml"
4
+ CONFIGYAML = """\
5
+ # dctap configuration file (in YAML format)
6
+ # See https://dctap-python.readthedocs.io/en/latest/config/ for more options
7
+
8
+ prefixes:
9
+ ":": "http://example.org/"
10
+ "dc:": "http://purl.org/dc/elements/1.1/"
11
+ "dcterms:": "http://purl.org/dc/terms/"
12
+ "dct:": "http://purl.org/dc/terms/"
13
+ "foaf:": "http://xmlns.com/foaf/0.1/"
14
+ "owl:": "http://www.w3.org/2002/07/owl#"
15
+ "rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
16
+ "rdfs:": "http://www.w3.org/2000/01/rdf-schema#"
17
+ "schema:": "http://schema.org/"
18
+ "skos:": "http://www.w3.org/2004/02/skos/core#"
19
+ "skosxl:": "http://www.w3.org/2008/05/skos-xl#"
20
+ "wdt:": "http://www.wikidata.org/prop/direct/"
21
+ "xsd:": "http://www.w3.org/2001/XMLSchema#"
22
+ """
dctap/exceptions.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Exception classes for dctap."""
2
+
3
+
4
class DctapError(SystemExit):
    """Exceptions related to Dctap generally.

    Base class of the package's exceptions. Because it derives from
    SystemExit, an instance that is not caught ends the program, with the
    exception's argument serving as the exit status or message.
    """


class ConfigError(DctapError):
    """Exceptions related to configuration."""


class NoDataError(DctapError):
    """Exception raised if there is no (TAP) data to process."""
dctap/inspect.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Print CSV contents and warnings."""
2
+
3
+ import sys
4
+ from dctap.loggers import stderr_logger
5
+
6
+
7
def pprint_tapshapes(tapshapes_dict=None, config_dict=None):
    """Render the shapes as a list of text lines, ready for printing to console.

    Each element with a truthy value becomes one line: the indented element
    name padded to at least 33 characters, followed by the value. Names of
    extra (non-DCTAP) elements are shown in square brackets.
    """
    # pylint: disable=too-many-branches
    main_shems = config_dict.get("shape_elements")
    xtra_shems = config_dict.get("extra_shape_elements")
    main_stems = config_dict.get("statement_template_elements")
    xtra_stems = config_dict.get("extra_statement_template_elements")

    def _element_lines(source, keys, indent, bracketed):
        """Yield one padded 'name value' line per key with a truthy value."""
        for key in keys:
            value = source.get(key)
            if value:
                label = "[" + key + "]" if bracketed else key
                yield (indent * " " + label + " ").ljust(33) + str(value)

    lines = ["Tabular Application Profile (TAP)"]
    for shape in tapshapes_dict.get("shapes"):
        lines.append(" Shape")
        lines.extend(_element_lines(shape, main_shems, 8, False))
        lines.extend(_element_lines(shape, xtra_shems, 8, True))
        for template in shape.get("statement_templates"):
            lines.append(" Statement Template")
            lines.extend(_element_lines(template, main_stems, 12, False))
            lines.extend(_element_lines(template, xtra_stems, 12, True))
    return lines
47
+
48
+
49
def print_warnings(warnings_dict):
    """Log every warning to standard error, after one blank line.

    warnings_dict maps a shape identifier to a dict that maps an element
    name to a list of warning messages. Each message is emitted as
    "[shape/element] message".
    """
    # pylint: disable=logging-fstring-interpolation
    print("", file=sys.stderr)
    echo = stderr_logger()
    messages = (
        f"[{shapeid}/{elem}] {warning}"
        for (shapeid, warns) in warnings_dict.items()
        for (elem, warn_list) in warns.items()
        for warning in warn_list
    )
    for message in messages:
        echo.warning(message)
dctap/loggers.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Loggers: warnings to stderr, warnings to file, verbose debug info to file."""
2
+
3
+ import logging
4
+ import sys
5
+
6
+ PLAIN_FORMATTER = logging.Formatter("%(levelname)s %(message)s")
7
+
8
+ TIMESTAMP_FORMATTER = logging.Formatter(
9
+ "%(asctime)s %(levelname)s %(message)s", "%Y-%m-%d %H:%M"
10
+ )
11
+
12
+
13
def stderr_logger():
    """Initialize logger for printing messages, INFO and higher, to standard error.

    logging.getLogger returns the same logger object for the same name on
    every call, so handlers left over from earlier calls are removed first.
    Otherwise each call would add one more handler and every message would
    be printed once per call made so far. The fresh handler is bound to the
    current sys.stderr.

    Returns:
        The logger named "stderr_logger", carrying exactly one stream handler.
    """
    stderr_info_logger = logging.getLogger("stderr_logger")
    stderr_info_logger.setLevel(logging.INFO)
    for stale_handler in list(stderr_info_logger.handlers):
        stderr_info_logger.removeHandler(stale_handler)
    stderr_info_logger_handler = logging.StreamHandler(sys.stderr)
    stderr_info_logger_handler.setLevel(logging.INFO)
    stderr_info_logger_handler.setFormatter(PLAIN_FORMATTER)
    stderr_info_logger.addHandler(stderr_info_logger_handler)
    return stderr_info_logger
22
+
23
+
24
def warning_logger(output_filename="warnings.log"):
    """Logger for printing time-stamped messages, WARNING and higher, to file.

    Handlers left over from earlier calls are closed and removed before the
    new file handler is attached. Otherwise each call would add another
    handler to the same named logger and messages would be written once per
    handler.

    Args:
        output_filename: File to write to; it is opened in "w" mode, so any
            existing content is discarded.

    Returns:
        The logger named "warningfile_logger", carrying exactly one file handler.
    """
    warningfile_logger = logging.getLogger("warningfile_logger")
    warningfile_logger.setLevel(logging.INFO)
    for stale_handler in list(warningfile_logger.handlers):
        warningfile_logger.removeHandler(stale_handler)
        stale_handler.close()
    warningfile_logger_handler = logging.FileHandler(output_filename, mode="w")
    warningfile_logger_handler.setLevel(logging.WARNING)
    warningfile_logger_handler.setFormatter(TIMESTAMP_FORMATTER)
    warningfile_logger.addHandler(warningfile_logger_handler)
    return warningfile_logger
33
+
34
+
35
def debug_logger(output_filename="debug.log"):
    """Logger for printing time-stamped messages, DEBUG and higher, to file.

    Handlers left over from earlier calls are closed and removed before the
    new file handler is attached, so that repeated calls do not write each
    message several times.

    Args:
        output_filename: File to write to; it is opened in "w" mode, so any
            existing content is discarded.

    Returns:
        The logger named "debugfile_logger", carrying exactly one file handler.
    """
    debugfile_logger = logging.getLogger("debugfile_logger")
    debugfile_logger.setLevel(logging.DEBUG)
    for stale_handler in list(debugfile_logger.handlers):
        debugfile_logger.removeHandler(stale_handler)
        stale_handler.close()
    debugfile_logger_handler = logging.FileHandler(output_filename, mode="w")
    debugfile_logger_handler.setLevel(logging.DEBUG)
    debugfile_logger_handler.setFormatter(TIMESTAMP_FORMATTER)
    debugfile_logger.addHandler(debugfile_logger_handler)
    # Return the configured logger object, not this factory function.
    return debugfile_logger
dctap/tapclasses.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Classes for Python objects derived from CSV files."""
2
+
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from typing import List
6
+ from dctap.utils import coerce_integer, coerce_numeric, looks_like_uri_or_curie
7
+
8
+
9
+ @dataclass
10
+ class TAPStatementTemplate:
11
+ """Instances hold TAP/CSV elements related to statement templates."""
12
+
13
+ # pylint: disable=too-many-instance-attributes # It's a dataclass, right?
14
+ # pylint: disable=invalid-name # for elements not named in snake case.
15
+
16
+ propertyID: str = ""
17
+ propertyLabel: str = ""
18
+ mandatory: str = ""
19
+ repeatable: str = ""
20
+ valueNodeType: str = ""
21
+ valueDataType: str = ""
22
+ valueConstraint: str = ""
23
+ valueConstraintType: str = ""
24
+ valueShape: str = ""
25
+ note: str = ""
26
+ state_warns: dict = field(default_factory=dict)
27
+ state_extras: dict = field(default_factory=dict)
28
+
29
+ def normalize(self, config_dict):
30
+ """Normalizes specific fields."""
31
+ # pylint: disable=attribute-defined-outside-init
32
+ self._normalize_booleans()
33
+ self._valueConstraintType_pattern_warn_if_valueConstraint_not_valid_regex()
34
+ self._valueConstraintType_pattern_warn_if_used_with_value_shape()
35
+ self._valueConstraintType_iristem_parse()
36
+ self._valueConstraintType_iristem_warn_if_list_items_not_IRIs()
37
+ self._valueConstraintType_languageTag_parse(config_dict)
38
+ self._valueConstraintType_minmaxlength_warn_if_not_nonnegative_integer()
39
+ self._valueConstraintType_minmaxinclusive_parse()
40
+ self._valueConstraintType_minmaxinclusive_warn_if_value_not_numeric()
41
+ self._valueConstraintType_warn_if_used_without_valueConstraint()
42
+ self._valueDataType_warn_if_used_with_valueNodeType_IRI()
43
+ self._valueNodeType_warn_if_valueNodeType_literal_used_with_any_valueShape()
44
+ self._valueConstraintType_picklist_parse(config_dict)
45
+ self._valueNodeType_is_from_enumerated_list(config_dict)
46
+ self._parse_elements_configured_as_picklist_elements(config_dict)
47
+ return self
48
+
49
+ def _warn_if_value_not_urilike(self):
50
+ """Warns when values of given elements do not look like URIs."""
51
+ elements_that_may_take_uris = ["propertyID", "valueDataType", "valueShape"]
52
+ for elem in elements_that_may_take_uris:
53
+ value = getattr(self, elem)
54
+ warning = f"Value '{value}' does not look like a URI."
55
+ if value:
56
+ if not looks_like_uri_or_curie(value):
57
+ self.state_warns[elem] = warning
58
+ return self
59
+
60
+ def _normalize_booleans(self):
61
+ """Coerces supported Boolean values to 'true' or 'false' or leaves unchanged."""
62
+ valid_values_for_true = ["true", "TRUE", "True", "1"]
63
+ valid_values_for_false = ["false", "FALSE", "False", "0"]
64
+ boolean_elements = ["mandatory", "repeatable"]
65
+ for elem in boolean_elements:
66
+ value = getattr(self, elem)
67
+ if value:
68
+ warning_message = f"'{value}' is not a supported Boolean value."
69
+ if value in valid_values_for_true:
70
+ setattr(self, elem, "true")
71
+ elif value in valid_values_for_false:
72
+ setattr(self, elem, "false")
73
+ else:
74
+ self.state_warns[elem] = warning_message
75
+ return self
76
+
77
+ def _valueConstraintType_iristem_parse(self):
78
+ """If valueConstraintType is Iristem, split valueConstraint on whitespace."""
79
+ self.valueConstraintType = self.valueConstraintType.lower()
80
+ if self.valueConstraintType == "iristem":
81
+ if self.valueConstraint:
82
+ self.valueConstraint = self.valueConstraint.split()
83
+ return self
84
+
85
+ def _valueConstraintType_iristem_warn_if_list_items_not_IRIs(self):
86
+ """If IRIStem, warn if valueConstraint list items do not look like IRIs."""
87
+ self.valueConstraintType = self.valueConstraintType.lower()
88
+ if self.valueConstraintType == "iristem":
89
+ for list_item in self.valueConstraint:
90
+ if not looks_like_uri_or_curie(list_item):
91
+ self.state_warns["valueConstraint"] = (
92
+ f"Value constraint type is '{self.valueConstraintType}', "
93
+ f"but '{list_item}' does not look like an IRI or "
94
+ "Compact IRI."
95
+ )
96
+ return self
97
+
98
def _valueConstraintType_minmaxlength_warn_if_not_nonnegative_integer(self):
    """
    Coerce valueConstraint to an integer where possible (else leave as is).
    For minLength and maxLength, warn unless the result is an integer >= 0.

    Note that the coercion is applied whatever the constraint type is; only
    the warning is limited to minLength/maxLength. Returns self.
    """
    constraint_type = self.valueConstraintType.lower()
    self.valueConstraint = coerce_integer(self.valueConstraint)
    coerced = self.valueConstraint
    if constraint_type not in ("minlength", "maxlength"):
        return self
    if not isinstance(coerced, int) or coerced < 0:
        self.state_warns["valueConstraint"] = (
            f"Value constraint type is '{self.valueConstraintType}', "
            f"but '{self.valueConstraint}' is not a positive integer."
        )
    return self
117
+
118
+ def _valueConstraintType_minmaxinclusive_parse(self):
119
+ """
120
+ If value of valueConstraintType is 'minInclusive' or 'maxInclusive',
121
+ value of valueConstraint should be numeric (int or float).
122
+ """
123
+ self.valueConstraintType = self.valueConstraintType.lower()
124
+ value_constraint = self.valueConstraint
125
+ if self.valueConstraintType in ("mininclusive", "maxinclusive"):
126
+ if value_constraint:
127
+ self.valueConstraint = coerce_numeric(value_constraint)
128
+ return self
129
+
130
+ def _valueConstraintType_minmaxinclusive_warn_if_value_not_numeric(self):
131
+ """Warns if valueConstraint for minInclusive not coercable to float."""
132
+ self.valueConstraintType = self.valueConstraintType.lower()
133
+ if self.valueConstraintType in ("mininclusive", "maxinclusive"):
134
+ try:
135
+ float(self.valueConstraint)
136
+ except (ValueError, TypeError):
137
+ self.state_warns["valueConstraint"] = (
138
+ f"Value constraint type is '{self.valueConstraintType}', "
139
+ f"but '{self.valueConstraint}' is not numeric."
140
+ )
141
+ return self
142
+
143
+ def _valueConstraintType_pattern_warn_if_valueConstraint_not_valid_regex(self):
144
+ """If valueConstraintType Pattern, warn if valueConstraint not valid regex."""
145
+ self.valueConstraintType = self.valueConstraintType.lower()
146
+ if self.valueConstraintType == "pattern":
147
+ try:
148
+ re.compile(self.valueConstraint)
149
+ except (re.error, TypeError):
150
+ self.state_warns["valueConstraint"] = (
151
+ f"Value constraint type is '{self.valueConstraintType}', but "
152
+ f"'{self.valueConstraint}' is not a valid regular expression."
153
+ )
154
+ return self
155
+
156
+ def _valueConstraintType_pattern_warn_if_used_with_value_shape(self):
157
+ """Regular expressions cannot conform to value shapes."""
158
+ self.valueConstraintType = self.valueConstraintType.lower()
159
+ if self.valueConstraintType == "pattern":
160
+ if self.valueShape:
161
+ self.state_warns["valueConstraintType"] = (
162
+ f"Values of constraint type '{self.valueConstraintType}' "
163
+ "cannot conform to a value shape."
164
+ )
165
+
166
+ def _valueConstraintType_languageTag_parse(self, config_dict):
167
+ """For valueConstraintType languageTag, splits valueConstraint on whitespace."""
168
+ self.valueConstraintType = self.valueConstraintType.lower()
169
+ sep = config_dict.get("picklist_item_separator", " ")
170
+ if self.valueConstraintType == "languagetag":
171
+ if self.valueConstraint:
172
+ self.valueConstraint = self.valueConstraint.split(sep)
173
+ self.valueConstraint = [x.strip() for x in self.valueConstraint if x]
174
+ return self
175
+
176
+ def _valueConstraintType_warn_if_used_without_valueConstraint(self):
177
+ """Warns if valueConstraintType used without valueConstraint."""
178
+ if self.valueConstraintType:
179
+ if not self.valueConstraint:
180
+ self.state_warns["valueConstraint"] = (
181
+ f"Value constraint type '{self.valueConstraintType}' "
182
+ "has no corresponding value constraint."
183
+ )
184
+ return self
185
+
186
+ def _valueConstraintType_picklist_parse(self, config_dict):
187
+ """If valueConstraintType is Picklist, split valueConstraint on whitespace."""
188
+ self.valueConstraintType = self.valueConstraintType.lower()
189
+ sep = config_dict.get("picklist_item_separator", " ")
190
+ if self.valueConstraintType == "picklist":
191
+ if self.valueConstraint:
192
+ self.valueConstraint = self.valueConstraint.split(sep)
193
+ self.valueConstraint = [x.strip() for x in self.valueConstraint if x]
194
+ return self
195
+
196
+ def _valueNodeType_is_from_enumerated_list(self, config_dict):
197
+ """Take valueNodeType from configurable enumerated list, case-insensitive."""
198
+ warning = f"'{self.valueNodeType}' is not a valid node type."
199
+ valid_types = ["iri", "bnode", "literal"]
200
+ # This should be moved out to defaults dictionary.
201
+ if config_dict.get("extra_value_node_types"):
202
+ valid_types += [v.lower() for v in config_dict["extra_value_node_types"]]
203
+ if self.valueNodeType:
204
+ self.valueNodeType = self.valueNodeType.lower()
205
+ if self.valueNodeType not in valid_types:
206
+ self.state_warns["valueNodeType"] = warning
207
+ return self
208
+
209
+ def _valueNodeType_warn_if_valueNodeType_literal_used_with_any_valueShape(self):
210
+ """Value with node type Literal cannot conform to a value shape."""
211
+ warning = "Values of node type 'literal' cannot conform to value shapes."
212
+ self.valueNodeType = self.valueNodeType.lower()
213
+ if self.valueShape:
214
+ if self.valueNodeType == "literal":
215
+ self.state_warns["valueDataType"] = warning
216
+ return self
217
+
218
+ def _valueDataType_warn_if_used_with_valueShape(self):
219
+ """Value with any datatype cannot conform to a value shape."""
220
+ warning = "Values with datatypes (literals) cannot conform to value shapes."
221
+ if self.valueShape:
222
+ if self.valueDataType:
223
+ self.state_warns["valueDataType"] = warning
224
+ return self
225
+
226
+ def _valueDataType_warn_if_used_with_valueNodeType_IRI(self):
227
+ """Value with datatype implies Literal and cannot be node type IRI."""
228
+ node_type = self.valueNodeType
229
+ data_type = self.valueDataType
230
+ warning = f"Datatype '{data_type}' incompatible with node type '{node_type}'."
231
+ node_type = self.valueNodeType.lower()
232
+ if node_type in ("iri", "uri", "bnode"):
233
+ if self.valueDataType:
234
+ self.state_warns["valueDataType"] = warning
235
+ return self
236
+
237
+ def _parse_elements_configured_as_picklist_elements(self, config_dict):
238
+ """Parse elements configured as list elementss."""
239
+ if config_dict.get("picklist_item_separator"):
240
+ separator = config_dict.get("picklist_item_separator")
241
+ else:
242
+ separator = " "
243
+
244
+ if config_dict.get("picklist_elements"):
245
+ picklist_elements = config_dict.get("picklist_elements")
246
+ else:
247
+ picklist_elements = []
248
+
249
+ for element in picklist_elements:
250
+ if getattr(self, element):
251
+ setattr(self, element, getattr(self, element).split(separator))
252
+
253
+ return self
254
+
255
def get_warnings(self):
    """Return a new dict built from self.state_warns (filled by normalize())."""
    warnings_copy = dict(self.state_warns)
    return warnings_copy
258
+
259
+
260
@dataclass
class TAPShape:
    """Holds the elements of TAP/CSV rows that describe one named shape."""

    # pylint: disable=invalid-name
    # True that propertyID, etc, do not conform to snake-case naming style.

    shapeID: str = ""
    shapeLabel: str = ""
    state_list: List[TAPStatementTemplate] = field(default_factory=list)
    shape_warns: dict = field(default_factory=dict)
    shape_extras: dict = field(default_factory=dict)

    def normalize(self, config_dict):
        """Fill in a default shapeID, then check that it looks like a URI."""
        self._normalize_default_shapeID(config_dict)
        self._warn_if_value_not_urilike()
        return self

    def _normalize_default_shapeID(self, config_dict):
        """Set an empty shapeID from config, falling back to "default"."""
        if self.shapeID:
            return self
        self.shapeID = config_dict.get("default_shape_identifier", "default")
        return self

    def _warn_if_value_not_urilike(self):
        """Record a warning for each URI-taking element that is not URI-like."""
        for name in ("shapeID",):
            current = getattr(self, name)
            if current and not looks_like_uri_or_curie(current):
                self.shape_warns[name] = (
                    f"Value '{current}' does not look like a URI."
                )
        return self

    def get_warnings(self):
        """Return a new dict built from self.shape_warns (filled by normalize())."""
        warnings_copy = dict(self.shape_warns)
        return warnings_copy
dctap/utils.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utilities."""
2
+
3
+ import sys
4
+ import re
5
+ from pathlib import Path
6
+ from urllib.parse import urlparse
7
+ from ruamel.yaml import YAML, YAMLError
8
+ from ruamel.yaml.scanner import ScannerError
9
+ from ruamel.yaml.parser import ParserError
10
+ from dctap.exceptions import ConfigError, DctapError
11
+
12
+
13
def load_yaml_to_dict(yamlstring=None, yamlfile=None):
    """Parse YAML given as a string or as a file (filename or Path object).

    Returns the parsed YAML; an empty dict if there was nothing to parse
    (no string given, or file not found); None if the YAML is badly formed.
    Problems are reported on stderr. Raises DctapError if both a string and
    a file are given.
    """
    if yamlfile and yamlstring:
        raise DctapError("Can load YAML from string or file, but not both.")

    if yamlfile:
        try:
            yamlstring = Path(yamlfile).read_text(encoding="UTF-8")
        except FileNotFoundError:
            print(f"File '{yamlfile}' not found.", file=sys.stderr)

    if yamlstring is None:
        return {}

    parser = YAML(typ="safe", pure=True)
    try:
        return parser.load(yamlstring)
    except (YAMLError, ScannerError, ParserError):
        if yamlfile:
            complaint = f"YAML in '{yamlfile}' is badly formed."
        else:
            complaint = "YAML is badly formed."
        print(complaint, file=sys.stderr)
        return None
37
+
38
+
39
def coerce_concise(some_str=None):
    """
    Reduce the given string to a concise form:
    - remove every space, underscore, dash, and comma
    - convert to lowercase
    - strip double quotes, then single quotes, from both ends
    """
    removable = str.maketrans("", "", " _-,")
    squeezed = some_str.translate(removable).lower()
    return squeezed.strip('"').strip("'")
54
+
55
+
56
def coerce_integer(value_constraint=None):
    """Return int(value_constraint), or the value unchanged if that fails."""
    try:
        return int(value_constraint)
    except (ValueError, TypeError):
        # Not integer-like: hand the valueConstraint back untouched.
        return value_constraint
63
+
64
+
65
def coerce_numeric(value_constraint=None):
    """Coerce a value to int or float, or return it untouched.

    Conversion to int is tried first, then to float, so "5" becomes 5 and
    "5.0" becomes 5.0. Any string accepted by float() is converted. The
    original only converted strings identical to Python's own rendering of
    the float, so "5.50" or "1e3" stayed strings. A float argument is
    returned as is rather than being truncated by int().

    Args:
        value_constraint: The value to coerce, typically a string.

    Returns:
        An int or float if the value is numeric, else the value unchanged.
    """
    if isinstance(value_constraint, float):
        return value_constraint
    for convert in (int, float):
        try:
            return convert(value_constraint)
        except (ValueError, TypeError):
            continue
    return value_constraint  # not numeric: pass through untouched
75
+
76
+
77
def _expand_leading_prefix(value, prefixes):
    """Return value with its leading namespace prefix expanded, if one matches."""
    if not isinstance(value, str):
        return value
    for prefix, expansion in prefixes.items():
        # Literal comparison: a prefix is plain text, not a regex pattern.
        if prefix and value.startswith(prefix):
            return expansion + value[len(prefix):]
    return value


def expand_uri_prefixes(shapes_dict=None, config_dict=None):
    """Expand namespace prefixes, eg: dc:date to http://purl.org/dc/terms/date.

    Expands the shapeID of each shape, and the propertyID, valueDataType and
    valueShape of each of its statement templates, in place. Prefixes are
    matched literally and only at the start of a value; at most one prefix
    is expanded per value.

    Args:
        shapes_dict: Dict with a "shapes" list; each shape has "shapeID" and
            "statement_templates".
        config_dict: Dict whose "prefixes" entry maps prefix to namespace.

    Returns:
        The same shapes_dict, modified in place.

    Raises:
        ConfigError: If config_dict has no non-empty "prefixes" entry.
    """
    if not config_dict or not config_dict.get("prefixes"):
        raise ConfigError("No 'prefixes' section found in config file.")
    prefixes = config_dict["prefixes"]
    for shape in shapes_dict["shapes"]:
        shape["shapeID"] = _expand_leading_prefix(shape["shapeID"], prefixes)
        for sc in shape["statement_templates"]:
            for element in ("propertyID", "valueDataType", "valueShape"):
                if sc.get(element):
                    sc[element] = _expand_leading_prefix(sc[element], prefixes)
    return shapes_dict
95
+
96
+
97
def looks_like_uri_or_curie(url_string):
    """True if string superficially looks like a URI or Compact URI.

    A string qualifies if urlparse() finds a scheme (which may equally be a
    CURIE prefix, as in "dc:date") or if its path starts with a colon (a
    CURIE with an empty prefix, as in ":name").

    Args:
        url_string: The value to check; non-strings are never URI-like.

    Returns:
        True or False. Strings that urlparse() rejects outright, such as
        "http://[bad" with its unbalanced bracket, yield False rather than
        raising ValueError.
    """
    if not isinstance(url_string, str):
        return False
    try:
        url_parsed = urlparse(url_string)
    except ValueError:
        return False
    has_prefix = bool(url_parsed.scheme)  # could be URI scheme or CURIE prefix
    has_name = url_parsed.path.startswith(":")  # CURIE with empty prefix
    return has_prefix or has_name