from collections import OrderedDict

import pymarc


def get_record_values(record, location):
    """Return the values found at ``location`` in ``record`` as one string.

    ``location`` is either a bare tag (``"245"``) or a tag and a subfield
    code separated by ``$`` (``"245$a"``). With a bare tag, every matching
    field contributes its ``value()``; with a subfield code, every matching
    field contributes each subfield carrying that code. The collected values
    are joined with single spaces, so an empty string is returned when
    nothing matches.

    Raises:
        ValueError: if ``location`` contains more than one ``$``.
    """
    parts = location.split("$")
    if len(parts) > 2:
        raise ValueError("Invalid location")
    tag = parts[0]
    code = parts[1] if len(parts) == 2 else None

    results = []
    for field in record.get_fields(tag):
        if field is None:
            continue
        if code is not None:
            results.extend(field.get_subfields(code))
        elif isinstance(field, pymarc.Field):
            results.append(field.value())
    return " ".join(results)


def _joined_values(record, *locations):
    """Space-join the ``get_record_values`` result of each location."""
    return " ".join(get_record_values(record, loc) for loc in locations)


def record_dict(record):
    """Flatten ``record`` into an ordered mapping of string values.

    Every entry is a string; fields or subfields that are absent from the
    record produce empty strings (or strings of separator spaces for the
    combined entries).
    """
    d = OrderedDict()

    # Dump every field value into a string
    d["raw"] = " ".join([f.value() for f in record.fields])

    d["cid"] = get_record_values(record, "CID")
    d["id"] = get_record_values(record, "001")

    # Fixed-position slices of the 008 field; slicing a short or empty
    # string simply yields a shorter/empty result rather than raising.
    fixed_data = get_record_values(record, "008")
    d["pub_date"] = fixed_data[7:11]
    d["pub_place"] = fixed_data[15:18]
    d["language"] = fixed_data[35:38]

    d["title_a"] = get_record_values(record, "245$a")
    d["title_b"] = get_record_values(record, "245$b")
    d["title_c"] = get_record_values(record, "245$c")
    d["title_p"] = get_record_values(record, "245$p")
    # title_c is deliberately left out of the combined title, as before.
    d["title"] = " ".join([d["title_a"], d["title_b"], d["title_p"]])

    d["title_variation_a"] = get_record_values(record, "246$a")
    d["title_variation_b"] = get_record_values(record, "246$b")

    # get_record_values returns strings, so the two results must be joined
    # as a list of two items. Concatenating them first and joining the
    # resulting string would put a space between every character.
    d["subject_headings"] = _joined_values(record, "650$a", "650$x")
    d["author_names"] = _joined_values(record, "100$a", "700$a")
    d["corporate_names"] = _joined_values(record, "110$a", "710$a")
    d["meeting_names"] = _joined_values(record, "111$a", "711$a")

    # NOTE(review): assumes ``record.publisher`` is an attribute/property;
    # if the installed pymarc exposes it as a method this stores the bound
    # method instead of a string -- confirm against the pinned version.
    d["publisher"] = record.publisher or ""

    d["pagination"] = get_record_values(record, "300$a")
    d["dimensions"] = get_record_values(record, "300$c")
    return d


def load_records(path):
    """Load every record from the file at ``path`` into a list.

    The format is chosen from the text after the last ``.`` in ``path``,
    compared case-insensitively: ``mrc``/``marc`` files are read with
    ``pymarc.MARCReader``; ``json`` files are read line by line, taking the
    first record parsed from each non-blank line.

    Raises:
        ValueError: if the extension is not one of the supported ones.
    """
    extension = path.split(".")[-1]
    kind = extension.lower()
    records = []

    if kind in ("mrc", "marc"):
        with open(path, "rb") as marcfile:
            records.extend(pymarc.MARCReader(marcfile))
    elif kind == "json":
        with open(path, "r", encoding="utf-8") as jsonfile:
            for line in jsonfile:
                # Blank lines (e.g. a trailing newline) hold no record and
                # would make the JSON parser raise.
                if not line.strip():
                    continue
                records.append(pymarc.parse_json_to_array(line)[0])
    else:
        raise ValueError(f"Unsupported file extension: {extension}")

    return records