Feliks Zaslavskiy committed on
Commit
ecdea0f
1 Parent(s): cf9bb91
Files changed (5) hide show
  1. app.py +1 -0
  2. data_set_training.csv +30 -1
  3. dev_set_training.csv +9 -1
  4. eval.py +6 -0
  5. quick_evaluate.py +15 -3
app.py CHANGED
@@ -15,6 +15,7 @@ from io import BytesIO
15
 
16
  # For baseline 'sentence-transformers/paraphrase-albert-base-v2'
17
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
 
18
 
19
  similarity_threshold = 0.9
20
 
 
15
 
16
  # For baseline 'sentence-transformers/paraphrase-albert-base-v2'
17
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
18
+ model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
19
 
20
  similarity_threshold = 0.9
21
 
data_set_training.csv CHANGED
@@ -239,4 +239,33 @@ VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENN
239
  165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
240
  345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
241
  144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
242
- 144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
240
  345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
241
  144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
242
+ 144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
243
+ 14453 UNION ST, Mc Coll, SC 29570|144-53 UNION ST, Mc Coll, SC 29570|1
244
+ 14453 UNION ST, Mc Coll, SC 29570|144 53 UNION ST, Mc Coll, SC 29570|0
245
+ 14453 UNION ST, Mc Coll, SC 29570|14 453 UNION STREET, Mc Coll, SC 29570|1
246
+ 14453 UNION ST APT 343, Mc Coll, SC 29570|144 53 UNION ST APT 343, Mc Coll, SC 29570|1
247
+ 14453 UNION ST, Mc Coll, SC 29570|144-53A UNION STREET, Mc Coll, SC 29570|0
248
+ 14453 UNION ST, Mc Coll, SC 29570|14443 UNION ST, Mc Coll, SC 29570|0
249
+ 14453 UNION ST, Mc Coll, SC 29570|144-53 UNION ST APT 343, Mc Coll, SC 29570|0
250
+ 20334 PARK AVE, PARK CITY, UT 84060|20234 PARK AVE, PARK CITY, UT 84060|0
251
+ 20334 PARK AVE, PARK CITY, UT 84060|20-334 PARK AVE, PARK CITY, UT 84060|0
252
+ 20334 PARK AVE, PARK CITY, UT 84060|202-34 PARK AVENUE, PARK CITY, UT 84060|1
253
+ 20334 PARK AVE, PARK CITY SUITE 2, UT 84060|202 34 PARK AVENUE STE 2, PARK CITY, UT 84060|1
254
+ 203 MAPLE AVE FL 2, ENGLEWOOD, NJ 07631|203 MAPLE AVE, ENGLEWOOD, NJ 07631|1
255
+ 203 MAPLE AVE FL 2, ENGLEWOOD, NJ 07631|203 MAPLE AVENUE, ENGLEWOOD, NJ 07631|1
256
+ 203 MAPLE AVE FL 2 STE 3, ENGLEWOOD, NJ 07631|203 MAPLE AVE, ENGLEWOOD, NJ 07631|0
257
+ 203 MAPLE AVE, ENGLEWOOD, NJ 07631|205 MAPLE AVE, ENGLEWOOD, NJ 07631|0
258
+ 2032 MAPLE AVE, ENGLEWOOD, NJ 07631|2031 MAPLE AVE, ENGLEWOOD, NJ 07631|0
259
+ 1427 MARVIN GRIFFIN RD, AUGUSTA, GA 30906|1417 MARVIN GRIFFIN RD, AUGUSTA, GA 30906|0
260
+ 32 GRAND ST, NEDERLAND, TX 77627|33 GRAND ST, NEDERLAND, TX 77627|0
261
+ 32 GRAND ST, NEDERLAND, TX 77627|32 GRAND ST #4, NEDERLAND, TX 77627|0
262
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DRIVE SUITE 6, BARBOURVILLE, KY 40906|1
263
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DR. STE. 6, BARBOURVILLE KY, 40906|1
264
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|SUITE #6, 80 HOSPITAL DRIVE, BARBOURVILLE, KY 40906|1
265
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|STE #6 - 80 HOSPITAL DR., BARBOURVILLE, KY 40906|1
266
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|#6-80 HOSPITAL DRIVE, BARBOURVILLE, KY 40906|1
267
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80-2 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|0
268
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DR SUITE 6A, BARBOURVILLE, KY 40906|0
269
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|81 HOSPITAL DRIVE STE 6, BARBOURVILLE, KY 40906|0
270
+ 80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|82 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|0
271
+ 80 22 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|8022 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|1
dev_set_training.csv CHANGED
@@ -26,4 +26,12 @@ ADDRESS1|ADDRESS2|ARE_SAME
26
  8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
27
  HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
28
  115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
29
- 112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
 
 
 
 
 
 
 
 
 
26
  8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
27
  HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
28
  115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
29
+ 112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
30
+ 3619 S 22ND DR, YUMA, AZ 85364|3636 S 22ND DR, YUMA, AZ 85364|0
31
+ 7325 FRANKLIN BLVD, SACRAMENTO, CA 95823|73235 FRANKLIN BLVD, SACRAMENTO, CA 95823|0
32
+ 3660 MAIN ST, TUCSON, AZ 85721|3701 MAIN ST, TUCSON, AZ 85721|0
33
+ 3910 MAGNET RD, MALVERN, AR 72104|3910 MAGNET RD, STE 206 MALVERN, AR 72104|0
34
+ 15702 OBERLIN RD, RALEIGH, NC 27605|15702 OBERLIN RD FL 1, RALEIGH, NC 27605|1
35
+ 14425 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|14325 ROOSOVELT AVE, LA JOLLA, CA 92092|0
36
+ 14425 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|144-25 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|1
37
+ 14425 ROOSOVELT AVE, LA JOLLA, CA 92092|144-25A ROOSOVELT AVENUE, LA JOLLA, CA 92092|0
eval.py CHANGED
@@ -13,6 +13,12 @@ logger = logging.getLogger(__name__)
13
 
14
  model_name = 'sentence-transformers/paraphrase-albert-base-v2'
15
 
 
 
 
 
 
 
16
 
17
  model_sbert = SentenceTransformer(model_name)
18
 
 
13
 
14
  model_name = 'sentence-transformers/paraphrase-albert-base-v2'
15
 
16
+ #model_name='output/training_OnlineConstrativeLoss-2023-03-11_23-47-34'
17
+ #model_name= 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
18
+
19
+ #86% so far
20
+ model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
21
+
22
 
23
  model_sbert = SentenceTransformer(model_name)
24
 
quick_evaluate.py CHANGED
@@ -1,4 +1,4 @@
1
- #from transformers import AlbertTokenizer, AlbertModel
2
  from sklearn.metrics.pairwise import cosine_similarity
3
  from sentence_transformers import SentenceTransformer
4
 
@@ -6,15 +6,16 @@ from sentence_transformers import SentenceTransformer
6
 
7
  # base
8
  # large
9
- #tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
10
  #model = AlbertModel.from_pretrained("albert-base-v2")
11
  #'sentence-transformers/paraphrase-albert-base-v2'
12
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
13
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
14
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
15
- model_name='output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
16
  model_sbert = SentenceTransformer(model_name)
17
 
 
18
  def get_sbert_embedding(input_text):
19
  embedding = model_sbert.encode(input_text)
20
  return embedding.tolist()
@@ -40,6 +41,17 @@ a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
40
  a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
41
  a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
42
 
 
 
 
 
 
 
 
 
 
 
 
43
  #def get_embedding(input_text):
44
  # encoded_input = tokenizer(input_text, return_tensors='pt')
45
  # input_ids = encoded_input.input_ids
 
1
+ from transformers import AlbertTokenizer, AlbertModel
2
  from sklearn.metrics.pairwise import cosine_similarity
3
  from sentence_transformers import SentenceTransformer
4
 
 
6
 
7
  # base
8
  # large
9
+ tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
10
  #model = AlbertModel.from_pretrained("albert-base-v2")
11
  #'sentence-transformers/paraphrase-albert-base-v2'
12
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
13
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
14
  model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
15
+ model_name='output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
16
  model_sbert = SentenceTransformer(model_name)
17
 
18
+
19
  def get_sbert_embedding(input_text):
20
  embedding = model_sbert.encode(input_text)
21
  return embedding.tolist()
 
41
  a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
42
  a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
43
 
44
+ a19 = "THE PAVILION AT QUEENS FOR REHABILITAION AND NURSING 36-17 PARSONS BOULEVARD, FLUSHING, NY 11354"
45
+ a20 = "136-17 39TH AVENUE, 4TH FLOOR, SUITE CF-E, FLUSHING, NY 11354"
46
+ a21="WISDOM MEDICAL P.C., 136-20 38 TH AVE 6E, FLUSHING, NY 11354"
47
+
48
+ encoded_input = tokenizer(a21, return_tensors='pt')
49
+ input_ids = encoded_input.input_ids
50
+ input_num_tokens = input_ids.shape[1]
51
+ print(input_num_tokens)
52
+ list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
53
+ #
54
+ print( "Tokens : " + ' '.join(list_of_tokens))
55
  #def get_embedding(input_text):
56
  # encoded_input = tokenizer(input_text, return_tensors='pt')
57
  # input_ids = encoded_input.input_ids