Feliks Zaslavskiy committed on
Commit
4132514
1 Parent(s): b5b5700
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +51 -4
  3. data.py +7 -5
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Strealit Data Synthesis Example
3
  emoji: 💩
4
  colorFrom: blue
5
  colorTo: gray
 
1
  ---
2
+ title: Address matching Example
3
  emoji: 💩
4
  colorFrom: blue
5
  colorTo: gray
app.py CHANGED
@@ -1,8 +1,31 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
 
 
4
 
5
- st.title('Excel File Uploader')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  st.markdown('Upload an Excel file to view the data in a table.')
8
 
@@ -13,20 +36,44 @@ if uploaded_file is not None:
13
  data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH')
14
  data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB')
15
 
 
 
16
  data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
17
  + np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)) \
18
  + data_caqh['city'].astype(str) + ', '\
19
  + data_caqh['state'].astype(str) + ', ' \
20
  + data_caqh['postalcode'].astype(str)
21
 
 
22
  data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
23
 
24
  data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
25
- np.where( data_ndb['zip_pls_4_cd'].isnull() , '', '-' \
26
  + data_ndb['zip_pls_4_cd'].astype(str))
27
 
28
- data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str) + ', ' \
29
  + data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']
30
 
 
 
 
 
 
 
31
  st.dataframe(data_caqh)
32
- st.dataframe(data_ndb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ import torch
5
+ #from transformers import AlbertTokenizer, AlbertModel
6
+ #from sklearn.metrics.pairwise import cosine_similarity
7
 
8
+ #tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
9
+ #model = AlbertModel.from_pretrained("albert-base-v2")
10
+
11
+ #def get_embedding(input_text):
12
+ # encoded_input = tokenizer(input_text, return_tensors='pt')
13
+ # input_ids = encoded_input.input_ids
14
+ # input_num_tokens = input_ids.shape[1]
15
+ #
16
+ # #print( "Number of input tokens: " + str(input_num_tokens))
17
+ # #print("Length of input: " + str(len(input_text)))
18
+ #
19
+ # list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
20
+ #
21
+ # #print( "Tokens : " + ' '.join(list_of_tokens))
22
+ # with torch.no_grad():
23
+ # output = model(**encoded_input)
24
+ #
25
+ # embedding = output.last_hidden_state[0][0]
26
+ # return embedding.tolist()
27
+
28
+ st.title('Upload the Address Dataset')
29
 
30
  st.markdown('Upload an Excel file to view the data in a table.')
31
 
 
36
  data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH')
37
  data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB')
38
 
39
+ # Data cleaning CAQH
40
+ data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
41
  data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
42
  + np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)) \
43
  + data_caqh['city'].astype(str) + ', '\
44
  + data_caqh['state'].astype(str) + ', ' \
45
  + data_caqh['postalcode'].astype(str)
46
 
47
+ # Data cleaning NDB
48
  data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
49
 
50
  data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) +\
51
+ np.where( data_ndb['zip_pls_4_cd'] == '', '', '-' \
52
  + data_ndb['zip_pls_4_cd'].astype(str))
53
 
54
+ data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
55
  + data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']
56
 
57
+ # Add a matched column
58
+ data_caqh['matched-addr'] = ''
59
+
60
+ # App
61
+ #data_caqh['embed'] = data_caqh['full-addr'].apply(get_embedding)
62
+
63
  st.dataframe(data_caqh)
64
+ st.dataframe(data_ndb)
65
+
66
+ # Do some matching
67
+ #data_caqh.loc[data_caqh['full-addr'] == '1000 Vale Terrace, Vista, CA, 92084', 'matched-addr'] = '456 Main St'
68
+ #time.sleep(10)
69
+ #st.dataframe(data_caqh)
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
data.py CHANGED
@@ -1,8 +1,10 @@
1
  from transformers import AlbertTokenizer, AlbertModel
2
  from sklearn.metrics.pairwise import cosine_similarity
3
 
4
- tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
5
- model = AlbertModel.from_pretrained("albert-large-v2")
 
 
6
 
7
  a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
8
  a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
@@ -32,11 +34,11 @@ e2 = get_embedding(a2)
32
  e4 = get_embedding(a4)
33
  e5 = get_embedding(a5)
34
 
35
- print("a1 to a2")
36
  print(cosine_similarity([e1], [e2]))
37
- print("a1 to a4")
38
  print(cosine_similarity([e1], [e4]))
39
- print("a1 to a5")
40
  print(cosine_similarity([e1], [e5]))
41
 
42
  # with base
 
1
  from transformers import AlbertTokenizer, AlbertModel
2
  from sklearn.metrics.pairwise import cosine_similarity
3
 
4
+ # base
5
+ # large
6
+ tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
7
+ model = AlbertModel.from_pretrained("albert-base-v2")
8
 
9
  a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
10
  a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
 
34
  e4 = get_embedding(a4)
35
  e5 = get_embedding(a5)
36
 
37
+ print(f"a1 {a1} to {a2} a2")
38
  print(cosine_similarity([e1], [e2]))
39
+ print(f"a1 {a1} to {a4} a4")
40
  print(cosine_similarity([e1], [e4]))
41
+ print(f"a1 {a1} to {a5} a5")
42
  print(cosine_similarity([e1], [e5]))
43
 
44
  # with base
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  streamlit
2
  pandas
 
3
  openpyxl
 
1
  streamlit
2
  pandas
3
+ numpy
4
  openpyxl