RvanB committed on
Commit
a85bc9a
1 Parent(s): fbf7e95

Implement gradio demo

Browse files
Files changed (5) hide show
  1. app.py +42 -0
  2. config.yaml +13 -22
  3. marcai/process.py +0 -6
  4. marcai/processing/comparisons.py +0 -22
  5. model.onnx +3 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pymarc
3
+ from marcai.process import process
4
+ from marcai.utils.parsing import record_dict
5
+ import pandas as pd
6
+ from marcai.predict import predict_onnx
7
+ from marcai.utils import load_config
8
+
9
def compare(file1, file2):
    """Classify whether two uploaded MARC XML records match.

    Parameters
    ----------
    file1, file2:
        Paths (or file-like objects, as provided by ``gr.File``) to MARC XML
        documents. Each must contain at least one record; only the first
        record of each file is compared.

    Returns
    -------
    dict
        ``{"match": p, "not match": 1 - p}`` — label/probability pairs in the
        shape expected by ``gr.Label``.

    Raises
    ------
    gr.Error
        If either file contains no MARC records (shown to the user in the UI
        instead of an opaque IndexError traceback).
    """
    records1 = pymarc.parse_xml_to_array(file1)
    records2 = pymarc.parse_xml_to_array(file2)
    # Robustness: an empty/invalid upload would otherwise crash with
    # IndexError on [0]; surface a clear message in the Gradio UI instead.
    if not records1 or not records2:
        raise gr.Error("Each uploaded file must contain at least one MARC record.")
    record1 = records1[0]
    record2 = records2[0]

    # Wrap each record in a one-row DataFrame so the batch-oriented
    # processing pipeline can be reused for a single pair.
    df1 = pd.DataFrame.from_dict([record_dict(record1)])
    df2 = pd.DataFrame.from_dict([record_dict(record2)])

    df = process(df1, df2)

    # Load model config to learn which comparison features the model expects.
    config = load_config("config.yaml")
    model_onnx = "model.onnx"

    # Run ONNX model on only the configured feature columns, in config order.
    input_df = df[config["model"]["features"]]
    prediction = predict_onnx(model_onnx, input_df)

    # Single pair in -> scalar match probability out.
    prediction = prediction.item()

    return {"match": prediction, "not match": 1 - prediction}
29
+
30
+
31
# Gradio UI wiring: two MARC XML file uploads in, a probability label out.
_file_inputs = [
    gr.File(label="MARC XML File 1"),
    gr.File(label="MARC XML File 2"),
]

interface = gr.Interface(
    fn=compare,
    inputs=_file_inputs,
    outputs=gr.Label(label="Classification"),
    title="MARC Record Matcher",
    description="Upload two MARC XML files with one record each.",
    allow_flagging="never",
)

# Start the demo server when the script is executed.
interface.launch()
config.yaml CHANGED
@@ -1,31 +1,22 @@
1
  model:
2
- # Inputs features
3
  features:
4
- - title_tokenset
5
- - title_agg
6
- - author
7
- - publisher
8
- - pub_date
9
- - pub_place
10
- - pagination
11
- # Size of hidden layers
12
  hidden_sizes:
13
- - 32
14
- - 64
15
-
16
- # Training
17
- batch_size: 512
18
- weight_decay: 0.0
19
- max_epochs: -1
20
-
21
- # Disable early stopping with -1
22
- patience: 20
23
-
24
  lr: 0.006
 
25
  optimizer: Adam
 
26
  saved_models_dir: saved_models
27
-
28
- # Paths to dataset splits
29
  test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
30
  train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
31
  val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
 
 
1
  model:
2
+ batch_size: 512
3
  features:
4
+ - title_tokenset
5
+ - title_agg
6
+ - author
7
+ - publisher
8
+ - pub_date
9
+ - pub_place
10
+ - pagination
 
11
  hidden_sizes:
12
+ - 32
13
+ - 64
 
 
 
 
 
 
 
 
 
14
  lr: 0.006
15
+ max_epochs: -1
16
  optimizer: Adam
17
+ patience: 20
18
  saved_models_dir: saved_models
 
 
19
  test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
20
  train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
21
  val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
22
+ weight_decay: 0.0
marcai/process.py CHANGED
@@ -109,7 +109,6 @@ def process(df0, df1):
109
  df0["raw"], df1["raw"], null_value=0.5
110
  )
111
 
112
-
113
  # Token sort ratio
114
  result_df["publisher"] = comps.token_sort_similarity(
115
  df0["publisher"], df1["publisher"], null_value=0.5
@@ -140,11 +139,6 @@ def process(df0, df1):
140
  df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
141
  )
142
 
143
- # Phonetic difference
144
- result_df["title_phonetic"] = comps.phonetic_similarity(
145
- df0["title"], df1["title"], null_value=0
146
- )
147
-
148
  # Length difference
149
  result_df["title_length"] = comps.length_similarity(
150
  df0["title"], df1["title"], null_value=0.5
 
109
  df0["raw"], df1["raw"], null_value=0.5
110
  )
111
 
 
112
  # Token sort ratio
113
  result_df["publisher"] = comps.token_sort_similarity(
114
  df0["publisher"], df1["publisher"], null_value=0.5
 
139
  df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
140
  )
141
 
 
 
 
 
 
142
  # Length difference
143
  result_df["title_length"] = comps.length_similarity(
144
  df0["title"], df1["title"], null_value=0.5
marcai/processing/comparisons.py CHANGED
@@ -3,9 +3,6 @@ import re
3
  import pandas as pd
4
  from thefuzz import fuzz
5
  import textdistance
6
- import fuzzy
7
- from sklearn.metrics.pairwise import cosine_similarity
8
- from sklearn.feature_extraction.text import TfidfVectorizer
9
 
10
 
11
 
@@ -190,25 +187,6 @@ def length_similarity(se0, se1, null_value):
190
 
191
  return pd.Series(col)
192
 
193
- def phonetic_similarity(se0, se1, null_value):
194
- soundex = fuzzy.Soundex(4)
195
-
196
- se0_np = se0.to_numpy(dtype=str)
197
- se1_np = se1.to_numpy(dtype=str)
198
-
199
- def compare_words(str0, str1):
200
- words0 = str0.split()
201
- words1 = str1.split()
202
-
203
- sounds0 = [soundex(word) for word in words0]
204
- sounds1 = [soundex(word) for word in words1]
205
-
206
- return sum(s0 == s1 for s0, s1 in zip(sounds0, sounds1)) / max(len(sounds0), len(sounds1))
207
-
208
- col = np.vectorize(compare_words)(se0_np, se1_np)
209
-
210
- return pd.Series(col)
211
-
212
 
213
  def jaccard_similarity(se0, se1, null_value):
214
  se0_np = se0.to_numpy(dtype=str)
 
3
  import pandas as pd
4
  from thefuzz import fuzz
5
  import textdistance
 
 
 
6
 
7
 
8
 
 
187
 
188
  return pd.Series(col)
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  def jaccard_similarity(se0, se1, null_value):
192
  se0_np = se0.to_numpy(dtype=str)
model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a549a29ebb618819a227d9568e8c1a6555e4f6407c3b4031a9170f4746ecdde
3
+ size 10669