Implement gradio demo
Browse files- app.py +42 -0
- config.yaml +13 -22
- marcai/process.py +0 -6
- marcai/processing/comparisons.py +0 -22
- model.onnx +3 -0
app.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pymarc
|
3 |
+
from marcai.process import process
|
4 |
+
from marcai.utils.parsing import record_dict
|
5 |
+
import pandas as pd
|
6 |
+
from marcai.predict import predict_onnx
|
7 |
+
from marcai.utils import load_config
|
8 |
+
|
9 |
+
def compare(file1, file2):
    """Compare two MARC XML records and classify them as a match.

    Args:
        file1: Path to a MARC XML file containing at least one record.
        file2: Path to a second MARC XML file containing at least one record.

    Returns:
        dict: Gradio Label mapping with keys "match" and "not match";
        the two confidences sum to 1.

    Raises:
        ValueError: If either file contains no MARC records.
    """
    records1 = pymarc.parse_xml_to_array(file1)
    records2 = pymarc.parse_xml_to_array(file2)
    if not records1 or not records2:
        raise ValueError("Each uploaded file must contain at least one MARC record.")

    # Only the first record of each file is compared.
    df1 = pd.DataFrame([record_dict(records1[0])])
    df2 = pd.DataFrame([record_dict(records2[0])])

    # Build the pairwise comparison-feature frame for the two records.
    df = process(df1, df2)

    # Load model config on each call — cheap relative to inference, and it
    # keeps the demo hot-reloadable when config.yaml changes.
    config = load_config("config.yaml")
    model_onnx = "model.onnx"

    # Run the ONNX model on exactly the feature columns it was trained on.
    input_df = df[config["model"]["features"]]
    prediction = predict_onnx(model_onnx, input_df).item()

    return {"match": prediction, "not match": 1 - prediction}
|
29 |
+
|
30 |
+
|
31 |
+
# Gradio UI: two file inputs, one label output showing match confidence.
interface = gr.Interface(
    fn=compare,
    inputs=[
        gr.File(label="MARC XML File 1"),
        gr.File(label="MARC XML File 2"),
    ],
    outputs=gr.Label(label="Classification"),
    title="MARC Record Matcher",
    description="Upload two MARC XML files with one record each.",
    allow_flagging="never",
)

# Launch only when executed as a script so the module stays importable
# (e.g. for tests or an external hosting wrapper) without side effects.
if __name__ == "__main__":
    interface.launch()
|
config.yaml
CHANGED
@@ -1,31 +1,22 @@
|
|
1 |
model:
|
2 |
-
|
3 |
features:
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
# Size of hidden layers
|
12 |
hidden_sizes:
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
# Training
|
17 |
-
batch_size: 512
|
18 |
-
weight_decay: 0.0
|
19 |
-
max_epochs: -1
|
20 |
-
|
21 |
-
# Disable early stopping with -1
|
22 |
-
patience: 20
|
23 |
-
|
24 |
lr: 0.006
|
|
|
25 |
optimizer: Adam
|
|
|
26 |
saved_models_dir: saved_models
|
27 |
-
|
28 |
-
# Paths to dataset splits
|
29 |
test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
|
30 |
train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
|
31 |
val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
|
|
|
|
1 |
model:
|
2 |
+
batch_size: 512
|
3 |
features:
|
4 |
+
- title_tokenset
|
5 |
+
- title_agg
|
6 |
+
- author
|
7 |
+
- publisher
|
8 |
+
- pub_date
|
9 |
+
- pub_place
|
10 |
+
- pagination
|
|
|
11 |
hidden_sizes:
|
12 |
+
- 32
|
13 |
+
- 64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
lr: 0.006
|
15 |
+
max_epochs: -1
|
16 |
optimizer: Adam
|
17 |
+
patience: 20
|
18 |
saved_models_dir: saved_models
|
|
|
|
|
19 |
test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
|
20 |
train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
|
21 |
val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
|
22 |
+
weight_decay: 0.0
|
marcai/process.py
CHANGED
@@ -109,7 +109,6 @@ def process(df0, df1):
|
|
109 |
df0["raw"], df1["raw"], null_value=0.5
|
110 |
)
|
111 |
|
112 |
-
|
113 |
# Token sort ratio
|
114 |
result_df["publisher"] = comps.token_sort_similarity(
|
115 |
df0["publisher"], df1["publisher"], null_value=0.5
|
@@ -140,11 +139,6 @@ def process(df0, df1):
|
|
140 |
df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
|
141 |
)
|
142 |
|
143 |
-
# Phonetic difference
|
144 |
-
result_df["title_phonetic"] = comps.phonetic_similarity(
|
145 |
-
df0["title"], df1["title"], null_value=0
|
146 |
-
)
|
147 |
-
|
148 |
# Length difference
|
149 |
result_df["title_length"] = comps.length_similarity(
|
150 |
df0["title"], df1["title"], null_value=0.5
|
|
|
109 |
df0["raw"], df1["raw"], null_value=0.5
|
110 |
)
|
111 |
|
|
|
112 |
# Token sort ratio
|
113 |
result_df["publisher"] = comps.token_sort_similarity(
|
114 |
df0["publisher"], df1["publisher"], null_value=0.5
|
|
|
139 |
df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
|
140 |
)
|
141 |
|
|
|
|
|
|
|
|
|
|
|
142 |
# Length difference
|
143 |
result_df["title_length"] = comps.length_similarity(
|
144 |
df0["title"], df1["title"], null_value=0.5
|
marcai/processing/comparisons.py
CHANGED
@@ -3,9 +3,6 @@ import re
|
|
3 |
import pandas as pd
|
4 |
from thefuzz import fuzz
|
5 |
import textdistance
|
6 |
-
import fuzzy
|
7 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
|
10 |
|
11 |
|
@@ -190,25 +187,6 @@ def length_similarity(se0, se1, null_value):
|
|
190 |
|
191 |
return pd.Series(col)
|
192 |
|
193 |
-
def phonetic_similarity(se0, se1, null_value):
|
194 |
-
soundex = fuzzy.Soundex(4)
|
195 |
-
|
196 |
-
se0_np = se0.to_numpy(dtype=str)
|
197 |
-
se1_np = se1.to_numpy(dtype=str)
|
198 |
-
|
199 |
-
def compare_words(str0, str1):
|
200 |
-
words0 = str0.split()
|
201 |
-
words1 = str1.split()
|
202 |
-
|
203 |
-
sounds0 = [soundex(word) for word in words0]
|
204 |
-
sounds1 = [soundex(word) for word in words1]
|
205 |
-
|
206 |
-
return sum(s0 == s1 for s0, s1 in zip(sounds0, sounds1)) / max(len(sounds0), len(sounds1))
|
207 |
-
|
208 |
-
col = np.vectorize(compare_words)(se0_np, se1_np)
|
209 |
-
|
210 |
-
return pd.Series(col)
|
211 |
-
|
212 |
|
213 |
def jaccard_similarity(se0, se1, null_value):
|
214 |
se0_np = se0.to_numpy(dtype=str)
|
|
|
3 |
import pandas as pd
|
4 |
from thefuzz import fuzz
|
5 |
import textdistance
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
|
|
|
187 |
|
188 |
return pd.Series(col)
|
189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
def jaccard_similarity(se0, se1, null_value):
|
192 |
se0_np = se0.to_numpy(dtype=str)
|
model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a549a29ebb618819a227d9568e8c1a6555e4f6407c3b4031a9170f4746ecdde
|
3 |
+
size 10669
|