RvanB committed
Commit fbf7e95
1 Parent(s): 891c20f

Add files from other repo

.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__
+ .ipynb_checkpoints
+ .DS_Store
config.yaml ADDED
@@ -0,0 +1,31 @@
+ model:
+   # Input features
+   features:
+     - title_tokenset
+     - title_agg
+     - author
+     - publisher
+     - pub_date
+     - pub_place
+     - pagination
+   # Size of hidden layers
+   hidden_sizes:
+     - 32
+     - 64
+
+   # Training
+   batch_size: 512
+   weight_decay: 0.0
+   max_epochs: -1
+
+   # Disable early stopping with -1
+   patience: 20
+
+   lr: 0.006
+   optimizer: Adam
+   saved_models_dir: saved_models
+
+   # Paths to dataset splits
+   test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
+   train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
+   val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
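For reference (not part of this commit): a minimal sketch of how this config is consumed, using the load_config helper added below in marcai/utils/load_config.py. The relative path assumes the file sits in the current working directory.

    from marcai.utils import load_config

    config = load_config("config.yaml")
    features = config["model"]["features"]          # input similarity features
    hidden_sizes = config["model"]["hidden_sizes"]  # hidden layer widths, e.g. [32, 64]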
marcai/__init__.py ADDED
File without changes
marcai/find_matches.py ADDED
@@ -0,0 +1,73 @@
+ import argparse
+ from marcai.process import multiprocess_pairs
+ from marcai.predict import predict_onnx
+ from tqdm import tqdm
+ import pandas as pd
+
+ from marcai.utils.parsing import load_records, record_dict
+ from marcai.utils import load_config
+
+ import csv
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-i", "--inputs", nargs="+", help="MARC files", required=True)
+     parser.add_argument(
+         "-p",
+         "--pair-indices",
+         help="File containing indices of comparisons",
+         required=True,
+     )
+     parser.add_argument("-C", "--chunksize", help="Chunk size", type=int, default=50000)
+     parser.add_argument(
+         "-P", "--processes", help="Number of processes", type=int, default=1
+     )
+     parser.add_argument(
+         "-m",
+         "--model-dir",
+         help="Directory containing model ONNX and YAML files",
+         required=True,
+     )
+     parser.add_argument("-o", "--output", help="Output file", required=True)
+     parser.add_argument("-t", "--threshold", help="Threshold for matching", type=float)
+
+     args = parser.parse_args()
+
+     config_path = f"{args.model_dir}/config.yaml"
+     model_onnx = f"{args.model_dir}/model.onnx"
+
+     config = load_config(config_path)
+
+     # Load records
+     print("Loading records...")
+     records = []
+     for path in args.inputs:
+         records.extend([record_dict(r) for r in load_records(path)])
+
+     records_df = pd.DataFrame(records)
+
+     print(f"Loaded {len(records)} records.")
+
+     print("Processing and comparing records...")
+     written = False
+     with open(args.pair_indices, "r") as indices_file:
+         reader = csv.reader(indices_file)
+         # Process records
+         for df in tqdm(multiprocess_pairs(
+             records_df, reader, args.chunksize, args.processes
+         )):
+             input_df = df[config["model"]["features"]]
+             prediction = predict_onnx(model_onnx, input_df)
+             df.loc[:, "prediction"] = prediction.squeeze()
+
+             df = df[df["prediction"] >= args.threshold]
+
+             if not df.empty:
+                 if not written:
+                     df.to_csv(args.output, index=False)
+                     written = True
+                 else:
+                     df.to_csv(args.output, index=False, mode="a", header=False)
+
+ if __name__ == "__main__":
+     main()
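For reference (not part of this commit): a hypothetical invocation built from the flags defined above. File names are placeholders, and the module path assumes the marcai package is importable:

    python -m marcai.find_matches -i file1.mrc file2.mrc -p pairs.csv -m saved_models/my_run -o matches.csv -t 0.9 -P 4

Note that -t/--threshold has no default, so the prediction filter expects it to be supplied.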
marcai/pl/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .similarity_vector_model import SimilarityVectorModel
+ from .marc_data_module import MARCDataModule
marcai/pl/attribute_selector.py ADDED
@@ -0,0 +1,12 @@
+ import torch.nn as nn
+
+
+ class AttributeSelector(nn.Module):
+     def __init__(self, attrs):
+         super().__init__()
+
+         self.attrs = attrs
+
+     def forward(self, sim: dict) -> dict:
+         sim = {key: sim[key] for key in self.attrs if key in sim.keys()}
+         return sim
marcai/pl/marc_data_module.py ADDED
@@ -0,0 +1,51 @@
+ import pytorch_lightning as pl
+ from torch.utils.data import DataLoader
+ import torch
+ from .attribute_selector import AttributeSelector
+ from .similarity_vector_dataset import SimilarityVectorDataset
+ from typing import List
+
+
+ class MARCDataModule(pl.LightningDataModule):
+     def __init__(
+         self,
+         train_processed_path: str,
+         val_processed_path: str,
+         test_processed_path: str,
+         attrs: List[str],
+         batch_size: int,
+     ):
+         super().__init__()
+
+         self.train_processed_path = train_processed_path
+         self.val_processed_path = val_processed_path
+         self.test_processed_path = test_processed_path
+
+         self.batch_size = batch_size
+         self.transform = torch.nn.Sequential(AttributeSelector(attrs))
+
+         self.train_set = None
+         self.val_set = None
+         self.test_set = None
+
+     def setup(self, stage=None):
+         self.train_set = SimilarityVectorDataset(
+             self.train_processed_path, transform=self.transform
+         )
+         self.val_set = SimilarityVectorDataset(
+             self.val_processed_path, transform=self.transform
+         )
+         self.test_set = SimilarityVectorDataset(
+             self.test_processed_path, transform=self.transform
+         )
+
+     def train_dataloader(self):
+         return DataLoader(
+             self.train_set, batch_size=self.batch_size, num_workers=0, shuffle=True
+         )
+
+     def val_dataloader(self):
+         return DataLoader(self.val_set, batch_size=self.batch_size, num_workers=0)
+
+     def test_dataloader(self):
+         return DataLoader(self.test_set, batch_size=self.batch_size, num_workers=0)
marcai/pl/similarity_vector_dataset.py ADDED
@@ -0,0 +1,26 @@
+ from torch.utils.data import Dataset
+ import numpy as np
+ import pandas as pd
+
+
+ class SimilarityVectorDataset(Dataset):
+
+     def __init__(self, processed_path: str, transform=None):
+
+         self.transform = transform
+         self.data = pd.read_csv(processed_path)
+
+     def __len__(self):
+         return self.data.shape[0]
+
+     def __getitem__(self, idx):
+         row = self.data.iloc[idx].to_dict()
+
+         label = float(float(row["cid"]) == 1.0)
+
+         if self.transform:
+             row = self.transform(row)
+
+         row = np.array(list(row.values())).astype(float)
+
+         return row, label
marcai/pl/similarity_vector_model.py ADDED
@@ -0,0 +1,90 @@
+ import pytorch_lightning as pl
+ import torch
+ import torch.nn as nn
+ from torchmetrics import Accuracy
+
+
+ class SimilarityVectorModel(pl.LightningModule):
+     def __init__(self, lr, weight_decay, optimizer, batch_size, attrs, hidden_sizes):
+         super().__init__()
+
+         # Hyperparameters
+         self.attrs = attrs
+         self.lr = lr
+         self.weight_decay = weight_decay
+         self.optimizer = optimizer
+         self.batch_size = batch_size
+         self.save_hyperparameters()
+
+         # Create model layers
+         layer_sizes = [len(attrs)] + hidden_sizes + [1]
+         layers = []
+         for i in range(len(layer_sizes) - 1):
+             in_size, out_size = layer_sizes[i], layer_sizes[i + 1]
+             layers.append(nn.Linear(in_size, out_size))
+
+             if i < len(layer_sizes) - 2:
+                 layers.append(nn.ReLU())
+
+         self.layers = nn.Sequential(*layers)
+
+         self.sigmoid = nn.Sigmoid()
+         self.criterion = nn.BCEWithLogitsLoss()
+         self.accuracy = Accuracy(task="binary")
+
+     def forward(self, x):
+         return self.layers(x)
+
+     def predict(self, x):
+         return self.sigmoid(self(x))
+
+     def training_step(self, batch, batch_idx):
+         sim, label = batch
+         pred = self(sim.float())
+         label = label.unsqueeze(1)
+
+         loss = self.criterion(pred, label)
+         acc = self.accuracy(pred, label.long())
+
+         self.log("train_loss", loss, on_step=False, on_epoch=True)
+         self.log("train_acc", acc, on_step=False, on_epoch=True)
+
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         sim, label = batch
+         pred = self(sim.float())
+         label = label.unsqueeze(1)
+
+         loss = self.criterion(pred, label)
+         acc = self.accuracy(pred, label.long())
+
+         self.log("val_loss", loss, on_step=False, on_epoch=True)
+         self.log("val_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
+
+         return loss
+
+     def test_step(self, batch, batch_idx):
+         sim, label = batch
+         pred = self(sim.float())
+         label = label.unsqueeze(1)
+
+         loss = self.criterion(pred, label)
+         acc = self.accuracy(pred, label.long())
+
+         self.log("test_loss", loss, on_step=False, on_epoch=True)
+         self.log("test_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
+
+         return loss
+
+     def configure_optimizers(self):
+         optimizers = {
+             "Adadelta": torch.optim.Adadelta,
+             "Adagrad": torch.optim.Adagrad,
+             "Adam": torch.optim.Adam,
+             "RMSprop": torch.optim.RMSprop,
+             "SGD": torch.optim.SGD,
+         }
+         return optimizers[self.optimizer](
+             self.parameters(), lr=self.lr, weight_decay=self.weight_decay
+         )
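For reference (not part of this commit): a small sketch of the network this builds when given the values from config.yaml above; the random input is only illustrative.

    import torch
    from marcai.pl import SimilarityVectorModel

    model = SimilarityVectorModel(
        lr=0.006, weight_decay=0.0, optimizer="Adam", batch_size=512,
        attrs=["title_tokenset", "title_agg", "author", "publisher",
               "pub_date", "pub_place", "pagination"],
        hidden_sizes=[32, 64],
    )
    # Layers: Linear(7, 32) -> ReLU -> Linear(32, 64) -> ReLU -> Linear(64, 1)
    probs = model.predict(torch.rand(4, 7))  # sigmoid scores in (0, 1)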
marcai/predict.py ADDED
@@ -0,0 +1,75 @@
+ import argparse
+
+ import numpy as np
+ import onnxruntime
+ import pandas as pd
+
+ from marcai.utils import load_config
+
+
+ def sigmoid(x):
+     return 1 / (1 + np.exp(-1 * x))
+
+
+ def predict_onnx(model_onnx_path, data):
+     ort_session = onnxruntime.InferenceSession(model_onnx_path)
+
+     x = data.to_numpy(dtype=np.float32)
+
+     input_name = ort_session.get_inputs()[0].name
+     ort_inputs = {input_name: x}
+     ort_outs = np.array(ort_session.run(None, ort_inputs))
+     ort_outs = sigmoid(ort_outs)
+
+     return ort_outs
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-i", "--input", help="Path to preprocessed data file", required=True
+     )
+     parser.add_argument("-o", "--output", help="Output path", required=True)
+     parser.add_argument(
+         "-m",
+         "--model-dir",
+         help="Directory containing model ONNX and YAML files",
+         required=True,
+     )
+     parser.add_argument(
+         "--chunksize",
+         help="Chunk size for reading and predicting",
+         default=1024,
+         type=int,
+     )
+
+     args = parser.parse_args()
+
+     config_path = f"{args.model_dir}/config.yaml"
+     model_onnx = f"{args.model_dir}/model.onnx"
+
+     config = load_config(config_path)
+
+     # Load data
+     data = pd.read_csv(args.input, chunksize=args.chunksize)
+
+     written = False
+     for chunk in data:
+         # Limit columns to model input features
+         input_df = chunk[config["model"]["features"]]
+
+         prediction = predict_onnx(model_onnx, input_df)
+
+         # Add prediction to chunk
+         chunk["prediction"] = prediction.squeeze()
+
+         # Append to CSV
+         if not written:
+             chunk.to_csv(args.output, index=False)
+             written = True
+         else:
+             chunk.to_csv(args.output, mode="a", header=False, index=False)
+
+
+ if __name__ == "__main__":
+     main()
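For reference (not part of this commit): a hypothetical invocation using the flags above, with placeholder file names:

    python -m marcai.predict -i processed.csv -o predictions.csv -m saved_models/my_run

The input CSV must already contain the similarity columns listed under model.features in the model directory's config.yaml.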
marcai/process.py ADDED
@@ -0,0 +1,269 @@
+ import argparse
+ import concurrent.futures
+ import csv
+ import itertools
+ import time
+
+ import numpy as np
+ import pandas as pd
+ from more_itertools import chunked
+
+ import marcai.processing.comparisons as comps
+ import marcai.processing.normalizations as norms
+ from marcai.utils.parsing import load_records, record_dict
+
+ from multiprocessing import get_context
+
+
+ def multiprocess_pairs(
+     records_df,
+     pair_indices,
+     chunksize=50000,
+     processes=1,
+ ):
+     # Create chunked iterator
+     pairs_chunked = chunked(pair_indices, chunksize)
+
+     # Create processing jobs
+     max_jobs = processes * 2
+
+     context = get_context("fork")
+
+     with concurrent.futures.ProcessPoolExecutor(
+         max_workers=processes, mp_context=context
+     ) as executor:
+         futures = set()
+         done = set()
+         first_spawn = True
+
+         while futures or first_spawn:
+             if first_spawn:
+                 spawn_count = max_jobs
+                 first_spawn = False
+             else:
+                 # Wait for a job to complete
+                 done, futures = concurrent.futures.wait(
+                     futures, return_when=concurrent.futures.FIRST_COMPLETED
+                 )
+                 spawn_count = max_jobs - len(futures)
+
+             for future in done:
+                 # Get job's output
+                 df = future.result()
+
+                 # Yield output
+                 yield df
+
+             # Spawn jobs
+             for _ in range(spawn_count):
+                 pairs_chunk = next(pairs_chunked, None)
+
+                 if pairs_chunk is None:
+                     break
+
+                 indices = np.array(pairs_chunk).astype(int)
+
+                 left_indices = indices[:, 0]
+                 right_indices = indices[:, 1]
+
+                 left_records = records_df.iloc[left_indices].reset_index(drop=True)
+                 right_records = records_df.iloc[right_indices].reset_index(drop=True)
+
+                 futures.add(executor.submit(process, left_records, right_records))
+
+
+ def process(df0, df1):
+     normalize_fields = [
+         "author_names",
+         "corporate_names",
+         "meeting_names",
+         "publisher",
+         "title",
+         "title_a",
+         "title_b",
+         "title_c",
+         "title_p",
+     ]
+
+     # Normalize text fields
+     for field in normalize_fields:
+         df0[field] = norms.lowercase(df0[field])
+         df1[field] = norms.lowercase(df1[field])
+
+         df0[field] = norms.remove_punctuation(df0[field])
+         df1[field] = norms.remove_punctuation(df1[field])
+
+         df0[field] = norms.remove_diacritics(df0[field])
+         df1[field] = norms.remove_diacritics(df1[field])
+
+         df0[field] = norms.normalize_whitespace(df0[field])
+         df1[field] = norms.normalize_whitespace(df1[field])
+
+     # Compare fields
+     result_df = pd.DataFrame()
+
+     result_df["id_0"] = df0["id"]
+     result_df["id_1"] = df1["id"]
+
+     result_df["raw_tokenset"] = comps.token_set_similarity(
+         df0["raw"], df1["raw"], null_value=0.5
+     )
+
+     # Token sort ratio
+     result_df["publisher"] = comps.token_sort_similarity(
+         df0["publisher"], df1["publisher"], null_value=0.5
+     )
+
+     author_names = comps.token_sort_similarity(
+         df0["author_names"], df1["author_names"], null_value=np.nan
+     )
+     corporate_names = comps.token_sort_similarity(
+         df0["corporate_names"], df1["corporate_names"], null_value=np.nan
+     )
+     meeting_names = comps.token_sort_similarity(
+         df0["meeting_names"], df1["meeting_names"], null_value=np.nan
+     )
+     authors = pd.concat([author_names, corporate_names, meeting_names], axis=1)
+
+     # Take max of author comparisons
+     result_df["author"] = comps.maximum(authors, null_value=0.5)
+
+     # Weighted title comparison
+     weights = {
+         "title_a": 1,
+         "raw": 0,
+         "title_p": 1,
+     }
+
+     result_df["title_agg"] = comps.column_aggregate_similarity(
+         df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
+     )
+
+     # Phonetic difference
+     result_df["title_phonetic"] = comps.phonetic_similarity(
+         df0["title"], df1["title"], null_value=0
+     )
+
+     # Length difference
+     result_df["title_length"] = comps.length_similarity(
+         df0["title"], df1["title"], null_value=0.5
+     )
+
+     # Token set similarity
+     result_df["title_tokenset"] = comps.token_set_similarity(
+         df0["title"], df1["title"], null_value=0
+     )
+
+     # Token sort ratio
+     result_df["title_tokensort"] = comps.token_sort_similarity(
+         df0["title"], df1["title"], null_value=0
+     )
+
+     # Levenshtein
+     result_df["title_levenshtein"] = comps.levenshtein_similarity(
+         df0["title"], df1["title"], null_value=0
+     )
+
+     # Jaro
+     result_df["title_jaro"] = comps.jaro_similarity(
+         df0["title"], df1["title"], null_value=0
+     )
+
+     # Jaro Winkler
+     result_df["title_jaro_winkler"] = comps.jaro_winkler_similarity(
+         df0["title"], df1["title"], null_value=0
+     )
+
+     # Pagination
+     result_df["pagination"] = comps.pagination_match(
+         df0["pagination"], df1["pagination"], null_value=0.5
+     )
+
+     # Dates
+     result_df["pub_date"] = comps.year_similarity(
+         df0["pub_date"], df1["pub_date"], null_value=0.5, exp_coeff=0.15
+     )
+
+     # Pub place
+     result_df["pub_place"] = comps.equal(
+         df0["pub_place"], df1["pub_place"], null_value=0.5
+     )
+
+     # CID/Label
+     result_df["cid"] = comps.equal(df0["cid"], df1["cid"], null_value=0.5)
+
+     return result_df
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     required = parser.add_argument_group("required arguments")
+     required.add_argument("-i", "--inputs", nargs="+", help="MARC files", required=True)
+     required.add_argument("-o", "--output", help="Output file", required=True)
+
+     parser.add_argument(
+         "-C",
+         "--chunksize",
+         type=int,
+         help="Number of comparisons per job",
+         default=50000,
+     )
+     parser.add_argument(
+         "-p", "--pair-indices", help="File containing indices of comparisons"
+     )
+     parser.add_argument(
+         "-P",
+         "--processes",
+         type=int,
+         help="Number of processes to run in parallel.",
+         default=1,
+     )
+
+     return parser.parse_args()
+
+
+ def main():
+     start = time.time()
+     args = parse_args()
+
+     # Load records
+     print("Loading records...")
+     records = []
+     for path in args.inputs:
+         records.extend([record_dict(r) for r in load_records(path)])
+
+     records_df = pd.DataFrame(records)
+
+     print(f"Loaded {len(records)} records.")
+
+     print("Processing records...")
+     # Process records
+     written = False
+     with open(args.pair_indices, "r") as indices_file:
+         reader = csv.reader(indices_file)
+
+         for df in multiprocess_pairs(
+             records_df, reader, args.chunksize, args.processes
+         ):
+             if not written:
+                 # Write header
+                 df.to_csv(args.output, mode="w", header=True, index=False)
+                 written = True
+             else:
+                 # Write rows of df to output CSV
+                 df.to_csv(args.output, mode="a", header=False, index=False)
+
+     end = time.time()
+     print(f"Processed {len(records)} records.")
+     print(f"Time elapsed: {end - start:.2f} seconds.")
+
+
+ if __name__ == "__main__":
+     main()
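For reference (not part of this commit): each row of the --pair-indices CSV is a pair of integer row positions into the combined set of loaded records (e.g. "0,3"), as the indices[:, 0] / indices[:, 1] slicing above implies. A hypothetical invocation with placeholder file names, assuming the marcai package is importable:

    python -m marcai.process -i file1.mrc file2.mrc -p pairs.csv -o comparisons.csv -P 4 -C 50000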
marcai/processing/__init__.py ADDED
@@ -0,0 +1 @@
+
marcai/processing/comparisons.py ADDED
@@ -0,0 +1,249 @@
+ import numpy as np
+ import re
+ import pandas as pd
+ from thefuzz import fuzz
+ import textdistance
+ import fuzzy
+
+
+ HAND_COUNT_PAGE_PATTERN = re.compile(r"\[(?P<hand_count>\d+)\]\s*p(ages)?[^\w]")
+ PAGE_PATTERN = re.compile(r"(?P<pages>\d+)\s*p(ages)?[^\w]")
+
+
+ def equal(se0, se1, null_value):
+     se0_np = se0.to_numpy(dtype=str)
+     se1_np = se1.to_numpy(dtype=str)
+
+     col = (se0_np == se1_np).astype(float)
+
+     se0_nulls = np.argwhere(np.char.strip(se0_np, " ") == "")
+     se1_nulls = np.argwhere(np.char.strip(se1_np, " ") == "")
+
+     col[se0_nulls] = null_value
+     col[se1_nulls] = null_value
+
+     return pd.Series(col)
+
+
+ def maximum(df, null_value, ignore_value=np.nan):
+     df_np = df.to_numpy(dtype=float)
+
+     df_np[df_np == ignore_value] = np.nan
+
+     # Mask ignore_value
+     masked = np.ma.masked_invalid(df_np)
+
+     # Get the max, ignoring NaNs
+     col = np.max(masked, axis=1)
+
+     # Replace NaNs with null_value
+     col = col.filled(fill_value=null_value)
+
+     return pd.Series(col)
+
+
+ def minimum(se0, se1, null_value, ignore_value=np.nan):
+     se0_np = se0.to_numpy(dtype=float)
+     se1_np = se1.to_numpy(dtype=float)
+
+     # Replace ignore_value with np.nans
+     se0_np[se0_np == ignore_value] = np.nan
+     se1_np[se1_np == ignore_value] = np.nan
+
+     # Get the min, ignoring NaNs
+     col = np.nanmin(np.stack([se0_np, se1_np], axis=1), axis=1)
+
+     # Replace NaNs with null_value
+     col[np.isnan(col)] = null_value
+
+     return pd.Series(col)
+
+
+ def pagination_match(se0, se1, null_value):
+     def group_values(pat, group, s):
+         return {m.groupdict()[group] for m in pat.finditer(s)}
+
+     def compare(pag0, pag1):
+         hand_counts0 = group_values(HAND_COUNT_PAGE_PATTERN, "hand_count", pag0)
+         hand_counts1 = group_values(HAND_COUNT_PAGE_PATTERN, "hand_count", pag1)
+
+         # Remove bracketed digits
+         pag0 = re.sub(r"\[\d+\]", " ", pag0)
+         pag1 = re.sub(r"\[\d+\]", " ", pag1)
+
+         # Remove punctuation
+         pag0 = re.sub(r"[^\w\s]", " ", pag0)
+         pag1 = re.sub(r"[^\w\s]", " ", pag1)
+
+         # Extract page counts
+         counts0 = group_values(PAGE_PATTERN, "pages", pag0 + " ")
+         counts1 = group_values(PAGE_PATTERN, "pages", pag1 + " ")
+
+         page_counts0 = counts0 | hand_counts0
+         page_counts1 = counts1 | hand_counts1
+
+         # Check if any pages are in common.
+         if page_counts0 and page_counts1:
+             for pg0 in page_counts0:
+                 for pg1 in page_counts1:
+                     pg0 = int(pg0)
+                     pg1 = int(pg1)
+
+                     if pg0 == pg1:
+                         return 1.0
+             return 0.0
+
+         return null_value
+
+     se0_np = se0.to_numpy(dtype=str)
+     se1_np = se1.to_numpy(dtype=str)
+
+     col = np.vectorize(compare)(se0_np, se1_np)
+     return pd.Series(col)
+
+
+ def year_similarity(se0, se1, null_value, exp_coeff):
+     def compare(yr0, yr1):
+         if yr0.isnumeric() and yr1.isnumeric():
+             x = abs(int(yr0) - int(yr1))
+
+             # Sigmoid where x = 0, y = 1, tail to the right
+             return 2 / (1 + np.exp(exp_coeff * x))
+
+         return null_value
+
+     se0_np = se0.to_numpy(dtype=str)
+     se1_np = se1.to_numpy(dtype=str)
+
+     return np.vectorize(compare)(se0_np, se1_np)
+
+
+ def column_aggregate_similarity(df0, df1, column_weights, null_value):
+     weights_dict = {k: v for k, v in zip(df0.columns, column_weights)}
+
+     def get_word_weights(row):
+         word_weights = {}
+         for i, value in enumerate(row):
+             column = df0.columns[i]
+             if column in weights_dict:
+                 current_weight = weights_dict[column]
+             else:
+                 current_weight = 0
+
+             for w in value.split():
+                 if w not in word_weights:
+                     word_weights[w] = current_weight
+                 else:
+                     word_weights[w] = max(current_weight, word_weights[w])
+         return word_weights
+
+     def compare(row0, row1):
+         weights0 = get_word_weights(row0)
+         weights1 = get_word_weights(row1)
+
+         total_weight = 0
+         missing_weight = 0
+
+         for w in weights0:
+             weight = weights0[w]
+             if w not in weights1:
+                 missing_weight += weights0[w]
+             else:
+                 weight = max(weight, weights1[w])
+             total_weight += weight
+
+         for w in weights1:
+             weight = weights1[w]
+             if w not in weights0:
+                 missing_weight += weights1[w]
+             else:
+                 weight = max(weight, weights0[w])
+             total_weight += weight
+
+         if total_weight == 0:
+             return null_value
+
+         return float((total_weight - missing_weight) / total_weight)
+
+     if df0.columns.to_list() != df1.columns.to_list():
+         raise ValueError("DataFrames must have the same columns")
+
+     # Run compare on rows of each df
+     col = np.array(
+         [compare(row0, row1) for row0, row1 in zip(df0.to_numpy(), df1.to_numpy())]
+     )
+
+     return pd.Series(col)
+
+
+ def length_similarity(se0, se1, null_value):
+     se0_np = se0.to_numpy(dtype=str)
+     se1_np = se1.to_numpy(dtype=str)
+
+     # max(..., 1) guards against division by zero; empty strings are overridden below
+     col = np.array([1 - abs(len(s0) - len(s1)) / max(len(s0), len(s1), 1) for s0, s1 in zip(se0_np, se1_np)])
+
+     # If either string is empty, set similarity to null_value
+     col[(se0_np == "") | (se1_np == "")] = null_value
+
+     return pd.Series(col)
+
+
+ def phonetic_similarity(se0, se1, null_value):
+     soundex = fuzzy.Soundex(4)
+
+     se0_np = se0.to_numpy(dtype=str)
+     se1_np = se1.to_numpy(dtype=str)
+
+     def compare_words(str0, str1):
+         words0 = str0.split()
+         words1 = str1.split()
+
+         # If either string has no words, there is nothing to compare
+         if not words0 or not words1:
+             return float(null_value)
+
+         sounds0 = [soundex(word) for word in words0]
+         sounds1 = [soundex(word) for word in words1]
+
+         return sum(s0 == s1 for s0, s1 in zip(sounds0, sounds1)) / max(len(sounds0), len(sounds1))
+
+     col = np.vectorize(compare_words)(se0_np, se1_np)
+
+     return pd.Series(col)
+
+
+ def jaccard_similarity(se0, se1, null_value):
+     se0_np = se0.to_numpy(dtype=str)
+     se1_np = se1.to_numpy(dtype=str)
+
+     col = np.array([textdistance.jaccard.normalized_similarity(set(s0.split()), set(s1.split())) for s0, s1 in zip(se0_np, se1_np)])
+
+     # If either string is empty, set similarity to null_value
+     col[(se0_np == "") | (se1_np == "")] = null_value
+
+     return pd.Series(col)
+
+
+ def similarity_factory(similarity_function):
+     def similarity(se0, se1, null_value):
+         se0_np = se0.to_numpy(dtype=str)
+         se1_np = se1.to_numpy(dtype=str)
+
+         col = np.vectorize(similarity_function)(se0_np, se1_np)
+
+         # Replace original null values with null_value
+         col[se0_np == ""] = null_value
+         col[se1_np == ""] = null_value
+
+         return pd.Series(col)
+
+     return similarity
+
+
+ token_set_similarity = similarity_factory(
+     lambda s0, s1: fuzz.token_set_ratio(s0, s1) / 100
+ )
+ token_sort_similarity = similarity_factory(
+     lambda s0, s1: fuzz.token_sort_ratio(s0, s1) / 100
+ )
+ levenshtein_similarity = similarity_factory(lambda s0, s1: (fuzz.ratio(s0, s1) / 100))
+ jaro_winkler_similarity = similarity_factory(lambda s0, s1: textdistance.jaro_winkler.similarity(s0, s1))
+ jaro_similarity = similarity_factory(lambda s0, s1: textdistance.jaro.similarity(s0, s1))
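For reference (not part of this commit): a quick usage sketch of one of the factory-built comparators on two small pandas Series; the expected values in the comment follow from the null-value masking above.

    import pandas as pd
    import marcai.processing.comparisons as comps

    s0 = pd.Series(["the great gatsby", ""])
    s1 = pd.Series(["great gatsby the", "moby dick"])
    comps.token_set_similarity(s0, s1, null_value=0.5)
    # -> 1.0 for the reordered title, 0.5 where one side is empty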
marcai/processing/normalizations.py ADDED
@@ -0,0 +1,36 @@
+ from unidecode import unidecode
+ import numpy as np
+ import pandas as pd
+
+
+ def remove_diacritics(series):
+     se_np = series.to_numpy()
+     se_np = np.vectorize(unidecode)(se_np)
+     return pd.Series(se_np)
+
+
+ def lowercase(series):
+     return series.str.lower()
+
+
+ def remove_punctuation(series):
+     return series.str.replace(r"[^\w\s]", "", regex=True)
+
+
+ def normalize_whitespace(series):
+     # Replace all whitespace characters with a space
+     s = series.str.replace(r"\s", " ", regex=True)
+     # Remove leading and trailing whitespace
+     s = s.str.strip()
+     # Collapse runs of spaces into one
+     return s.str.replace(r"\s+", " ", regex=True)
+
+
+ def substring(series, start, end):
+     return series.str[start:end]
+
+
+ def apply_normalizers(series, transforms):
+     for transform in transforms:
+         series = transform(series)
+     return series
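For reference (not part of this commit): how these normalizers compose through apply_normalizers, with an illustrative input string.

    import pandas as pd
    import marcai.processing.normalizations as norms

    s = pd.Series(["  Álgebra,   Básica! "])
    norms.apply_normalizers(
        s,
        [norms.lowercase, norms.remove_punctuation, norms.remove_diacritics, norms.normalize_whitespace],
    )
    # -> "algebra basica"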
marcai/train.py ADDED
@@ -0,0 +1,100 @@
+ import pytorch_lightning as lightning
+ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
+ import warnings
+ import yaml
+ import argparse
+ import os
+ import torch
+ from marcai.pl import MARCDataModule, SimilarityVectorModel
+ from marcai.utils import load_config
+ import tarfile
+
+
+ def train(name=None):
+     config_path = "config.yaml"
+     config = load_config(config_path)
+     model_config = config["model"]
+
+     # Create data module from processed data
+     warnings.filterwarnings("ignore", ".*does not have many workers.*")
+     data = MARCDataModule(
+         model_config["train_processed_path"],
+         model_config["val_processed_path"],
+         model_config["test_processed_path"],
+         model_config["features"],
+         model_config["batch_size"],
+     )
+
+     # Create model
+     model = SimilarityVectorModel(
+         model_config["lr"],
+         model_config["weight_decay"],
+         model_config["optimizer"],
+         model_config["batch_size"],
+         model_config["features"],
+         model_config["hidden_sizes"],
+     )
+
+     save_dir = os.path.join(model_config["saved_models_dir"], name)
+     os.makedirs(save_dir, exist_ok=True)
+
+     # Save best models
+     checkpoint_callback = ModelCheckpoint(
+         monitor="val_acc", mode="max", dirpath=save_dir, filename="model"
+     )
+     callbacks = [checkpoint_callback]
+
+     if model_config["patience"] != -1:
+         early_stop_callback = EarlyStopping(
+             monitor="val_acc",
+             min_delta=0.00,
+             patience=model_config["patience"],
+             verbose=False,
+             mode="max",
+         )
+         callbacks.append(early_stop_callback)
+
+     trainer = lightning.Trainer(
+         max_epochs=model_config["max_epochs"], callbacks=callbacks, accelerator="cpu"
+     )
+     trainer.fit(model, data)
+
+     # Save ONNX
+     onnx_path = os.path.join(save_dir, "model.onnx")
+     input_sample = torch.randn((1, len(model.attrs)))
+     torch.onnx.export(
+         model,
+         input_sample,
+         onnx_path,
+         export_params=True,
+         do_constant_folding=True,
+         input_names=["input"],
+         output_names=["output"],
+         dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
+     )
+
+     # Save config
+     config_filename = os.path.join(save_dir, "config.yaml")
+
+     with open(config_filename, "w") as f:
+         dump = yaml.dump(config)
+         f.write(dump)
+
+     # Compress model directory files
+     tar_path = f"{save_dir}/{name}.tar.gz"
+     with tarfile.open(tar_path, mode="w:gz") as archive:
+         archive.add(save_dir, arcname=os.path.basename(save_dir))
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-n", "--run-name", help="Name for training run", required=True
+     )
+     args = parser.parse_args()
+
+     train(args.run_name)
+
+
+ if __name__ == "__main__":
+     main()
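For reference (not part of this commit): a hypothetical run, assuming the marcai package is importable and config.yaml sits in the current working directory (train() reads the relative path "config.yaml"):

    python -m marcai.train -n my_run

Outputs land in saved_models/my_run: the best checkpoint, model.onnx, a copy of config.yaml, and a my_run.tar.gz archive of that directory.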
marcai/utils/__init__.py ADDED
@@ -0,0 +1 @@
+ from .load_config import load_config
marcai/utils/load_config.py ADDED
@@ -0,0 +1,6 @@
+ import yaml
+
+
+ def load_config(filename):
+     with open(filename, "r") as file:
+         return yaml.safe_load(file)
marcai/utils/parsing.py ADDED
@@ -0,0 +1,93 @@
+ from collections import OrderedDict
+
+ import pymarc
+
+
+ def get_record_values(record, location):
+     split = location.split("$")
+
+     if len(split) == 1:
+         tag = split[0]
+         code = None
+     elif len(split) == 2:
+         tag, code = split
+     else:
+         raise ValueError("Invalid location")
+
+     # Find fields matching tag
+     fields = record.get_fields(tag)
+
+     results = []
+     for current_value in fields:
+         if current_value is not None:
+             if code is not None:
+                 values = current_value.get_subfields(code)
+                 results.extend(values)
+             elif isinstance(current_value, pymarc.Field):
+                 results.append(current_value.value())
+
+     return " ".join(results)
+
+
+ def record_dict(record):
+     d = OrderedDict()
+
+     # Dump every field value into a string
+     d["raw"] = " ".join([f.value() for f in record.fields])
+
+     d["cid"] = get_record_values(record, "CID")
+     d["id"] = get_record_values(record, "001")
+
+     fixed_data = get_record_values(record, "008")
+     d["pub_date"] = fixed_data[7:11]
+     d["pub_place"] = fixed_data[15:18]
+     d["language"] = fixed_data[35:38]
+
+     d["title_a"] = get_record_values(record, "245$a")
+     d["title_b"] = get_record_values(record, "245$b")
+     d["title_c"] = get_record_values(record, "245$c")
+     d["title_p"] = get_record_values(record, "245$p")
+
+     d["title"] = " ".join([d["title_a"], d["title_b"], d["title_p"]])
+
+     d["title_variation_a"] = get_record_values(record, "246$a")
+     d["title_variation_b"] = get_record_values(record, "246$b")
+
+     d["subject_headings"] = " ".join(
+         [get_record_values(record, "650$a"), get_record_values(record, "650$x")]
+     )
+
+     d["author_names"] = " ".join(
+         [get_record_values(record, "100$a"), get_record_values(record, "700$a")]
+     )
+     d["corporate_names"] = " ".join(
+         [get_record_values(record, "110$a"), get_record_values(record, "710$a")]
+     )
+     d["meeting_names"] = " ".join(
+         [get_record_values(record, "111$a"), get_record_values(record, "711$a")]
+     )
+
+     d["publisher"] = record.publisher or ""
+
+     d["pagination"] = get_record_values(record, "300$a")
+     d["dimensions"] = get_record_values(record, "300$c")
+
+     return d
+
+
+ def load_records(path):
+     records = []
+     extension = path.split(".")[-1]
+     if extension == "mrc" or extension == "marc":
+         with open(path, "rb") as marcfile:
+             reader = pymarc.MARCReader(marcfile)
+             records.extend(list(reader))
+     elif extension == "json":
+         with open(path, "r") as jsonfile:
+             for line in jsonfile:
+                 record = pymarc.parse_json_to_array(line)[0]
+                 records.append(record)
+     else:
+         raise ValueError(f"Unsupported file extension: {extension}")
+
+     return records
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ pymarc
+ thefuzz
+ pandas
+ unidecode
+ python-levenshtein
+ onnxruntime
+ textdistance
+ more-itertools
+ pyyaml
+ onnx
+ tqdm
+ numpy
+ fuzzy
+ pytorch-lightning
+ torch
+ torchmetrics