RvanB committed on
Commit
a85bc9a
1 Parent(s): fbf7e95

Implement gradio demo

Browse files
Files changed (5) hide show
  1. app.py +42 -0
  2. config.yaml +13 -22
  3. marcai/process.py +0 -6
  4. marcai/processing/comparisons.py +0 -22
  5. model.onnx +3 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pymarc
3
+ from marcai.process import process
4
+ from marcai.utils.parsing import record_dict
5
+ import pandas as pd
6
+ from marcai.predict import predict_onnx
7
+ from marcai.utils import load_config
8
+
9
def compare(file1, file2):
    """Classify whether two uploaded MARC XML records match.

    Parameters
    ----------
    file1, file2:
        Paths (or file-like objects, as provided by ``gr.File``) to MARC XML
        documents. Each must contain at least one record; only the first
        record of each file is compared.

    Returns
    -------
    dict
        ``{"match": p, "not match": 1 - p}`` — label/probability pairs in the
        shape expected by ``gr.Label``.

    Raises
    ------
    gr.Error
        If either file contains no MARC records (shown to the user in the UI
        instead of an opaque IndexError traceback).
    """
    records1 = pymarc.parse_xml_to_array(file1)
    records2 = pymarc.parse_xml_to_array(file2)
    # Robustness: an empty/invalid upload would otherwise crash with
    # IndexError on [0]; surface a clear message in the Gradio UI instead.
    if not records1 or not records2:
        raise gr.Error("Each uploaded file must contain at least one MARC record.")
    record1 = records1[0]
    record2 = records2[0]

    # Wrap each record in a one-row DataFrame so the batch-oriented
    # processing pipeline can be reused for a single pair.
    df1 = pd.DataFrame.from_dict([record_dict(record1)])
    df2 = pd.DataFrame.from_dict([record_dict(record2)])

    df = process(df1, df2)

    # Load model config to learn which comparison features the model expects.
    config = load_config("config.yaml")
    model_onnx = "model.onnx"

    # Run ONNX model on only the configured feature columns, in config order.
    input_df = df[config["model"]["features"]]
    prediction = predict_onnx(model_onnx, input_df)

    # Single pair in -> scalar match probability out.
    prediction = prediction.item()

    return {"match": prediction, "not match": 1 - prediction}
29
+
30
+
31
# Gradio UI wiring: two MARC XML file uploads in, a probability label out.
_file_inputs = [
    gr.File(label="MARC XML File 1"),
    gr.File(label="MARC XML File 2"),
]

interface = gr.Interface(
    fn=compare,
    inputs=_file_inputs,
    outputs=gr.Label(label="Classification"),
    title="MARC Record Matcher",
    description="Upload two MARC XML files with one record each.",
    allow_flagging="never",
)

# Start the demo server when the script is executed.
interface.launch()
config.yaml CHANGED
@@ -1,31 +1,22 @@
1
  model:
2
- # Inputs features
3
  features:
4
- - title_tokenset
5
- - title_agg
6
- - author
7
- - publisher
8
- - pub_date
9
- - pub_place
10
- - pagination
11
- # Size of hidden layers
12
  hidden_sizes:
13
- - 32
14
- - 64
15
-
16
- # Training
17
- batch_size: 512
18
- weight_decay: 0.0
19
- max_epochs: -1
20
-
21
- # Disable early stopping with -1
22
- patience: 20
23
-
24
  lr: 0.006
 
25
  optimizer: Adam
 
26
  saved_models_dir: saved_models
27
-
28
- # Paths to dataset splits
29
  test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
30
  train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
31
  val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
 
 
1
  model:
2
+ batch_size: 512
3
  features:
4
+ - title_tokenset
5
+ - title_agg
6
+ - author
7
+ - publisher
8
+ - pub_date
9
+ - pub_place
10
+ - pagination
 
11
  hidden_sizes:
12
+ - 32
13
+ - 64
 
 
 
 
 
 
 
 
 
14
  lr: 0.006
15
+ max_epochs: -1
16
  optimizer: Adam
17
+ patience: 20
18
  saved_models_dir: saved_models
 
 
19
  test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
20
  train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
21
  val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
22
+ weight_decay: 0.0
marcai/process.py CHANGED
@@ -109,7 +109,6 @@ def process(df0, df1):
109
  df0["raw"], df1["raw"], null_value=0.5
110
  )
111
 
112
-
113
  # Token sort ratio
114
  result_df["publisher"] = comps.token_sort_similarity(
115
  df0["publisher"], df1["publisher"], null_value=0.5
@@ -140,11 +139,6 @@ def process(df0, df1):
140
  df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
141
  )
142
 
143
- # Phonetic difference
144
- result_df["title_phonetic"] = comps.phonetic_similarity(
145
- df0["title"], df1["title"], null_value=0
146
- )
147
-
148
  # Length difference
149
  result_df["title_length"] = comps.length_similarity(
150
  df0["title"], df1["title"], null_value=0.5
 
109
  df0["raw"], df1["raw"], null_value=0.5
110
  )
111
 
 
112
  # Token sort ratio
113
  result_df["publisher"] = comps.token_sort_similarity(
114
  df0["publisher"], df1["publisher"], null_value=0.5
 
139
  df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
140
  )
141
 
 
 
 
 
 
142
  # Length difference
143
  result_df["title_length"] = comps.length_similarity(
144
  df0["title"], df1["title"], null_value=0.5
marcai/processing/comparisons.py CHANGED
@@ -3,9 +3,6 @@ import re
3
  import pandas as pd
4
  from thefuzz import fuzz
5
  import textdistance
6
- import fuzzy
7
- from sklearn.metrics.pairwise import cosine_similarity
8
- from sklearn.feature_extraction.text import TfidfVectorizer
9
 
10
 
11
 
@@ -190,25 +187,6 @@ def length_similarity(se0, se1, null_value):
190
 
191
  return pd.Series(col)
192
 
193
- def phonetic_similarity(se0, se1, null_value):
194
- soundex = fuzzy.Soundex(4)
195
-
196
- se0_np = se0.to_numpy(dtype=str)
197
- se1_np = se1.to_numpy(dtype=str)
198
-
199
- def compare_words(str0, str1):
200
- words0 = str0.split()
201
- words1 = str1.split()
202
-
203
- sounds0 = [soundex(word) for word in words0]
204
- sounds1 = [soundex(word) for word in words1]
205
-
206
- return sum(s0 == s1 for s0, s1 in zip(sounds0, sounds1)) / max(len(sounds0), len(sounds1))
207
-
208
- col = np.vectorize(compare_words)(se0_np, se1_np)
209
-
210
- return pd.Series(col)
211
-
212
 
213
  def jaccard_similarity(se0, se1, null_value):
214
  se0_np = se0.to_numpy(dtype=str)
 
3
  import pandas as pd
4
  from thefuzz import fuzz
5
  import textdistance
 
 
 
6
 
7
 
8
 
 
187
 
188
  return pd.Series(col)
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  def jaccard_similarity(se0, se1, null_value):
192
  se0_np = se0.to_numpy(dtype=str)
model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a549a29ebb618819a227d9568e8c1a6555e4f6407c3b4031a9170f4746ecdde
3
+ size 10669