Feliks Zaslavskiy committed on
Commit
0c1e501
1 Parent(s): ce71282

small updates

Browse files
Files changed (3) hide show
  1. app.py +15 -10
  2. data_set_training.csv +3 -0
  3. quick_evaluate.py +1 -1
app.py CHANGED
@@ -2,8 +2,8 @@ import math
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
- import torch
6
- from transformers import AlbertTokenizer, AlbertModel
7
  from sentence_transformers import SentenceTransformer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  from io import BytesIO
@@ -14,7 +14,7 @@ from io import BytesIO
14
  #model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
15
 
16
  # For baseline 'sentence-transformers/paraphrase-albert-base-v2'
17
- model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_23-47-34'
18
 
19
  similarity_threshold = 0.9
20
 
@@ -60,12 +60,16 @@ if uploaded_file is not None:
60
  data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
61
 
62
  # Data cleaning CAQH
63
- data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
 
 
 
64
  data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
65
  + np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
66
  + data_caqh['city'].astype(str) + ', '\
67
  + data_caqh['state'].astype(str) + ' ' \
68
  + data_caqh['postalcode'].astype(str)
 
69
 
70
  st.write(f"CAQH before duplicate removal {len(data_caqh)}")
71
  data_caqh.drop_duplicates(subset='full-addr',inplace=True)
@@ -73,15 +77,16 @@ if uploaded_file is not None:
73
  st.write(f"CAQH after duplicate removal {len(data_caqh)}")
74
 
75
  # Data cleaning NDB
76
- data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
77
 
78
- data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
79
- np.where( data_ndb['zip_pls_4_cd'] == '', '', '-' \
80
- + data_ndb['zip_pls_4_cd'].astype(str))
81
 
82
  data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
83
- + data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
84
- + data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd_zip_pls_4_cd']
 
85
 
86
  # Calculate similarity For CAQH
87
  num_items = len(data_caqh)
 
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
+ #import torch
6
+ #from transformers import AlbertTokenizer, AlbertModel
7
  from sentence_transformers import SentenceTransformer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  from io import BytesIO
 
14
  #model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
15
 
16
  # For baseline 'sentence-transformers/paraphrase-albert-base-v2'
17
+ model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_00-40-03'
18
 
19
  similarity_threshold = 0.9
20
 
 
60
  data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)
61
 
62
  # Data cleaning CAQH
63
+ # if you need to format with 00000-0000
64
+ # lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x
65
+ data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5])
66
+
67
  data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
68
  + np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
69
  + data_caqh['city'].astype(str) + ', '\
70
  + data_caqh['state'].astype(str) + ' ' \
71
  + data_caqh['postalcode'].astype(str)
72
+ data_caqh['full-addr'] = data_caqh['full-addr'].str.upper()
73
 
74
  st.write(f"CAQH before duplicate removal {len(data_caqh)}")
75
  data_caqh.drop_duplicates(subset='full-addr',inplace=True)
 
77
  st.write(f"CAQH after duplicate removal {len(data_caqh)}")
78
 
79
  # Data cleaning NDB
80
+ #data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
81
 
82
+ #data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
83
+ # np.where( data_ndb['zip_pls_4_cd'] == '', '', '-' \
84
+ # + data_ndb['zip_pls_4_cd'].astype(str))
85
 
86
  data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
87
+ + data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
88
+ + data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd'].astype(str)
89
+ data_ndb['full-addr'] = data_ndb['full-addr'].str.upper()
90
 
91
  # Calculate similarity For CAQH
92
  num_items = len(data_caqh)
data_set_training.csv CHANGED
@@ -65,12 +65,15 @@ ADDRESS1|ADDRESS2|ARE_SAME
65
  145 34 23TH ST, JACKSONVILLE, FL 32258|145-50 23TH ST, JACKSONVILLE, FL 32258|0
66
  145-12 23TH ST, JACKSONVILLE, FL 32258|145 29 23TH ST, JACKSONVILLE, FL 32258|0
67
  15 49 RT 9, HALFMOON, NY 12065|15-49 RT 9, HALFMOON, NY 12065|1
 
68
  15 49 RT 9, HALFMOON, NY 12065|15-59 RT 9, HALFMOON, NY 12065|0
 
69
  15 49 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
70
  152 13 GOLD STAR HWY, GROTON, CT 63403|152-18 GOLD STAR HWY, GROTON, CT 63403|0
71
  152 43 GOLD STAR HWY, GROTON, CT 63403|152-43 GOLD STAR HWY, GROTON, CT 63403|0
72
  152 43 GOLD STAR HWY, GROTON, CT 63403|152-44 GOLD STAR HWY, GROTON, CT 63403|0
73
  154-9 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
 
74
  160-10 N MIDLAND AVE, NYACK, NY 10960|160 10 NORTH MIDLAND AVENUE, NYACK, NY 10960|1
75
  160-10 N MIDLAND AVE, NYACK, NY 10960|160 20 NORTH MIDLAND AVE, NYACK, NY 10960|0
76
  160-10 N MIDLAND AVE, NYACK, NY 10960|160-10 N MIDLAND AVENUE, NYACK, NY 10960|1
 
65
  145 34 23TH ST, JACKSONVILLE, FL 32258|145-50 23TH ST, JACKSONVILLE, FL 32258|0
66
  145-12 23TH ST, JACKSONVILLE, FL 32258|145 29 23TH ST, JACKSONVILLE, FL 32258|0
67
  15 49 RT 9, HALFMOON, NY 12065|15-49 RT 9, HALFMOON, NY 12065|1
68
+ 15 49 RT 9, HALFMOON, NY 12065|15-49 ROUTE 9, HALFMOON, NY 12065|1
69
  15 49 RT 9, HALFMOON, NY 12065|15-59 RT 9, HALFMOON, NY 12065|0
70
+ 15 49 RT 9, HALFMOON, NY 12065|15-59 ROUTE 9, HALFMOON, NY 12065|0
71
  15 49 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
72
  152 13 GOLD STAR HWY, GROTON, CT 63403|152-18 GOLD STAR HWY, GROTON, CT 63403|0
73
  152 43 GOLD STAR HWY, GROTON, CT 63403|152-43 GOLD STAR HWY, GROTON, CT 63403|0
74
  152 43 GOLD STAR HWY, GROTON, CT 63403|152-44 GOLD STAR HWY, GROTON, CT 63403|0
75
  154-9 RT 9, HALFMOON, NY 12065|1549 RT 9, HALFMOON, NY 12065|1
76
+ 154-9 RT 9, HALFMOON, NY 12065|1549 ROUTE 9, HALFMOON, NY 12065|1
77
  160-10 N MIDLAND AVE, NYACK, NY 10960|160 10 NORTH MIDLAND AVENUE, NYACK, NY 10960|1
78
  160-10 N MIDLAND AVE, NYACK, NY 10960|160 20 NORTH MIDLAND AVE, NYACK, NY 10960|0
79
  160-10 N MIDLAND AVE, NYACK, NY 10960|160-10 N MIDLAND AVENUE, NYACK, NY 10960|1
quick_evaluate.py CHANGED
@@ -33,7 +33,7 @@ a10= "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
33
  a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
34
  a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
35
  a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
36
- a14="257 37 US RT 11, EVANS MILLS, NY 13637"
37
  a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
38
 
39
  a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
 
33
  a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
34
  a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
35
  a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
36
+ a14="257 37 US ROUTE 11, EVANS MILLS, NY 13637"
37
  a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
38
 
39
  a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"