Feliks Zaslavskiy committed on
Commit
ce71282
1 Parent(s): 2627f58

training data

Browse files
Files changed (3) hide show
  1. data_set_training.csv +5 -0
  2. dev_set_training.csv +3 -1
  3. quick_evaluate.py +35 -19
data_set_training.csv CHANGED
@@ -232,3 +232,8 @@ MEMORIAL SATILLA HEALTH, 1900 TEBEAU ST, WAYCROSS, GA 31501|1900 TEBEAU STREET,
232
  VA MEDICAL CENTER 2002 HOLCOMBE BLVD, HOUSTON, TX 77030|VA MEDICAL CENTER 2002 HOLCOMBE BOULEVARD, HOUSTON, TX 77030|1
233
  VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, COLUMBUS, GA 31903|1
234
  VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, VALLEY HEALTHCARE SYSTEM, COLUMBUS, GA 31903|1
 
 
 
 
 
 
232
  VA MEDICAL CENTER 2002 HOLCOMBE BLVD, HOUSTON, TX 77030|VA MEDICAL CENTER 2002 HOLCOMBE BOULEVARD, HOUSTON, TX 77030|1
233
  VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, COLUMBUS, GA 31903|1
234
  VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, VALLEY HEALTHCARE SYSTEM, COLUMBUS, GA 31903|1
235
+ 315 22 BRAVERTON ST #110, EDGEWATER, MD 21037|315-22 BRAVERTON ST #110, EDGEWATER, MD 21037|1
236
+ 165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
237
+ 345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
238
+ 144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
239
+ 144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
dev_set_training.csv CHANGED
@@ -24,4 +24,6 @@ ADDRESS1|ADDRESS2|ARE_SAME
24
  87-44 ROUTE 13, CORTLANDVILLE, NY 13045|87 24 ROUTE 13, CORTLANDVILLE, NY 13045|0
25
  872 ROUTE 13, CORTLANDVILLE, NY 13045|87-2 ROUTE 13, CORTLANDVILLE ,NY 13045|1
26
  8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
27
- HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
 
 
 
24
  87-44 ROUTE 13, CORTLANDVILLE, NY 13045|87 24 ROUTE 13, CORTLANDVILLE, NY 13045|0
25
  872 ROUTE 13, CORTLANDVILLE, NY 13045|87-2 ROUTE 13, CORTLANDVILLE ,NY 13045|1
26
  8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
27
+ HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
28
+ 115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
29
+ 112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
quick_evaluate.py CHANGED
@@ -19,22 +19,27 @@ def get_sbert_embedding(input_text):
19
  embedding = model_sbert.encode(input_text)
20
  return embedding.tolist()
21
 
22
- a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
23
- a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
24
- a3 = "1677 NJ-27 #2, Edison, NJ 08817"
25
- a4 = "5078 S Maryland Pkwy, Las Vegas, NV 89119"
26
- a5 = "65 Mountain Boulevard Ext, Warren, NJ 07059"
27
- a6 = "123 Broad St, New York, NY, 10304-2345"
28
- a7 = "440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034"
29
- a8 = "200 Technology Center Drive, Boston, MA 10034"
30
- a8x= "87 Technology Center Drive, Boston, MA 10034"
31
- a9 = "440 Technology Center Dr., Boston, MA 10034-0345"
32
- a10 = "440 Technology Center Dr., Boston, MA 10034"
33
- a11="872 Route 13, Cortlandville NY 13045"
34
- a12="87-2 Route 13, Cortlandville NY 13045"
35
- a13="87-5 Route 13, Cortlandville NY 13045"
36
- a14="257 37 US Rt 11, Evans Mills NY 13637"
37
- a15="257-37 US Route 11, Evans Mills NY 13637"
 
 
 
 
 
38
  #def get_embedding(input_text):
39
  # encoded_input = tokenizer(input_text, return_tensors='pt')
40
  # input_ids = encoded_input.input_ids
@@ -63,7 +68,7 @@ e6 = get_sbert_embedding(a6)
63
  e7 = get_sbert_embedding(a7)
64
  e8 = get_sbert_embedding(a8)
65
  e8x = get_sbert_embedding(a8x)
66
- e9 = get_sbert_embedding(a9)
67
  e10 = get_sbert_embedding(a10)
68
  e11 = get_sbert_embedding(a11)
69
  e12 = get_sbert_embedding(a12)
@@ -71,6 +76,10 @@ e13 = get_sbert_embedding(a13)
71
  e14 = get_sbert_embedding(a14)
72
  e15 = get_sbert_embedding(a15)
73
 
 
 
 
 
74
  print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
75
  print(cosine_similarity([e1], [e2]))
76
  print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
@@ -83,8 +92,8 @@ print(cosine_similarity([e7], [e8]))
83
  print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
84
  print(cosine_similarity([e7], [e8x]))
85
 
86
- print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
87
- print(cosine_similarity([e7], [e9]))
88
 
89
  print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
90
  print(cosine_similarity([e7], [e10]))
@@ -97,6 +106,13 @@ print(cosine_similarity([e11], [e13]))
97
 
98
  print(f"a14 \"{a14}\" to \"{a15}\" a15 - expected Same")
99
  print(cosine_similarity([e14], [e15]))
 
 
 
 
 
 
 
100
  # with base
101
  #a1 to a2
102
  #[[0.99512167]]
 
19
  embedding = model_sbert.encode(input_text)
20
  return embedding.tolist()
21
 
22
+ a1 = "65 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
23
+ a2 = "112 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
24
+ a3 = "1677 NJ-27 #2, EDISON, NJ 08817"
25
+ a4 = "5078 S MARYLAND PKWY, LAS VEGAS, NV 89119"
26
+ a5 = "65 MOUNTAIN BOULEVARD EXT, WARREN, NJ 07059"
27
+ a6 = "123 BROAD ST, NEW YORK, NY, 10304-2345"
28
+ a7 = "440 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
29
+ a8 = "200 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
30
+ a8x= "87 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
31
+ #a9 = "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
32
+ a10= "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
33
+ a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
34
+ a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
35
+ a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
36
+ a14="257 37 US RT 11, EVANS MILLS, NY 13637"
37
+ a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
38
+
39
+ a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
40
+ a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
41
+ a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
42
+
43
  #def get_embedding(input_text):
44
  # encoded_input = tokenizer(input_text, return_tensors='pt')
45
  # input_ids = encoded_input.input_ids
 
68
  e7 = get_sbert_embedding(a7)
69
  e8 = get_sbert_embedding(a8)
70
  e8x = get_sbert_embedding(a8x)
71
+ #e9 = get_sbert_embedding(a9)
72
  e10 = get_sbert_embedding(a10)
73
  e11 = get_sbert_embedding(a11)
74
  e12 = get_sbert_embedding(a12)
 
76
  e14 = get_sbert_embedding(a14)
77
  e15 = get_sbert_embedding(a15)
78
 
79
+ e16 = get_sbert_embedding(a16)
80
+ e17 = get_sbert_embedding(a17)
81
+ e18 = get_sbert_embedding(a18)
82
+
83
  print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
84
  print(cosine_similarity([e1], [e2]))
85
  print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
 
92
  print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
93
  print(cosine_similarity([e7], [e8x]))
94
 
95
+ #print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
96
+ #print(cosine_similarity([e7], [e9]))
97
 
98
  print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
99
  print(cosine_similarity([e7], [e10]))
 
106
 
107
  print(f"a14 \"{a14}\" to \"{a15}\" a15 - expected Same")
108
  print(cosine_similarity([e14], [e15]))
109
+
110
+ print(f"a16 \"{a16}\" to \"{a17}\" a17 - expected Same")
111
+ print(cosine_similarity([e16], [e17]))
112
+
113
+ print(f"a16 \"{a16}\" to \"{a18}\" a18 - expected Different")
114
+ print(cosine_similarity([e16], [e18]))
115
+
116
  # with base
117
  #a1 to a2
118
  #[[0.99512167]]