felix committed on
Commit 3004443 · 1 Parent(s): f101de7
Files changed (1)
  1. data.py +56 -30
data.py CHANGED
@@ -1,13 +1,20 @@
-from transformers import AlbertTokenizer, AlbertModel
+#from transformers import AlbertTokenizer, AlbertModel
 from sklearn.metrics.pairwise import cosine_similarity
-import torch
+from sentence_transformers import SentenceTransformer
 
-#This is a quick evaluation to see if B
+#This is a quick evaluation on a few cases
 
 # base
 # large
-tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-model = AlbertModel.from_pretrained("albert-base-v2")
+#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+#model = AlbertModel.from_pretrained("albert-base-v2")
+#'sentence-transformers/paraphrase-albert-base-v2'
+model_name = 'output/training_OnlineConstrativeLoss-2023-03-09_23-55-34'
+model_sbert = SentenceTransformer(model_name)
+
+def get_sbert_embedding(input_text):
+    embedding = model_sbert.encode(input_text)
+    return embedding.tolist()
 
 a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
 a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
@@ -15,33 +22,42 @@ a3 = "1677 NJ-27 #2, Edison, NJ 08817"
 a4 = "5078 S Maryland Pkwy, Las Vegas, NV 89119"
 a5 = "65 Mountain Boulevard Ext, Warren, NJ 07059"
 a6 = "123 Broad St, New York, NY, 10304-2345"
+a7 = "440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034"
+a8 = "200 Technology Center Drive, Boston, MA 10034"
+a8x = "87 Technology Center Drive, Boston, MA 10034"
+a9 = "440 Technology Center Dr., Boston, MA 10034-0345"
+a10 = "440 Technology Center Dr., Boston, MA 10034"
 
-def get_embedding(input_text):
-    encoded_input = tokenizer(input_text, return_tensors='pt')
-    input_ids = encoded_input.input_ids
-    input_num_tokens = input_ids.shape[1]
-
-    print("Number of input tokens: " + str(input_num_tokens))
-    print("Length of input: " + str(len(input_text)))
-
-    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
-
-    print("Tokens : " + ' '.join(list_of_tokens))
-    with torch.no_grad():
-
-        outputs = model(**encoded_input)
-        last_hidden_states = outputs[0]
-        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
-        #sentence_embedding = output.last_hidden_state[0][0]
-    return sentence_embedding.tolist()
-
-e1 = get_embedding(a1)
-e2 = get_embedding(a2)
-#e3 = get_embedding(a3)
-e4 = get_embedding(a4)
-e5 = get_embedding(a5)
-e6 = get_embedding(a6)
+#def get_embedding(input_text):
+#    encoded_input = tokenizer(input_text, return_tensors='pt')
+#    input_ids = encoded_input.input_ids
+#    input_num_tokens = input_ids.shape[1]
+#
+#    print("Number of input tokens: " + str(input_num_tokens))
+#    print("Length of input: " + str(len(input_text)))
+#
+#    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
+#
+#    print("Tokens : " + ' '.join(list_of_tokens))
+#    with torch.no_grad():
+#
+#        outputs = model(**encoded_input)
+#        last_hidden_states = outputs[0]
+#        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
+#        #sentence_embedding = output.last_hidden_state[0][0]
+#    return sentence_embedding.tolist()
 
+e1 = get_sbert_embedding(a1)
+e2 = get_sbert_embedding(a2)
+#e3 = get_sbert_embedding(a3)
+e4 = get_sbert_embedding(a4)
+e5 = get_sbert_embedding(a5)
+e6 = get_sbert_embedding(a6)
+e7 = get_sbert_embedding(a7)
+e8 = get_sbert_embedding(a8)
+e8x = get_sbert_embedding(a8x)
+e9 = get_sbert_embedding(a9)
+e10 = get_sbert_embedding(a10)
 print(f"a1 \"{a1}\" to \"{a2}\" a2")
 print(cosine_similarity([e1], [e2]))
 print(f"a1 \"{a1}\" to \"{a4}\" a4")
@@ -49,6 +65,16 @@ print(cosine_similarity([e1], [e4]))
 print(f"a1 \"{a1}\" to \"{a5}\" a5")
 print(cosine_similarity([e1], [e5]))
 
+print(f"a7 \"{a7}\" to \"{a8}\" a8")
+print(cosine_similarity([e7], [e8]))
+print(f"a7 \"{a7}\" to \"{a8x}\" a8x")
+print(cosine_similarity([e7], [e8x]))
+
+print(f"a7 \"{a7}\" to \"{a9}\" a9")
+print(cosine_similarity([e7], [e9]))
+
+print(f"a7 \"{a7}\" to \"{a10}\" a10")
+print(cosine_similarity([e7], [e10]))
 # with base
 #a1 to a2
 #[[0.99512167]]
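
For reference, the pattern the new script relies on (batch encoding with SentenceTransformer, then cosine similarity) can be written more compactly by encoding all addresses in one call and comparing the full matrix at once. A minimal sketch follows; it is not part of the commit. It substitutes the public checkpoint 'sentence-transformers/paraphrase-albert-base-v2' (hinted at in a comment in the diff) for the local fine-tuned model, whose output/ path is not portable, and reuses a few of the addresses from data.py.

# Sketch only, not part of the commit: batch the addresses through a single
# encode() call and compute all pairwise similarities in one step.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

addresses = [
    "65 Mountain Blvd Ext, Warren, NJ 07059",         # a1
    "65 Mountain Boulevard Ext, Warren, NJ 07059",    # a5 (same place, spelled out)
    "440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034",  # a7
    "440 Technology Center Dr., Boston, MA 10034",    # a10 (same place, abbreviated)
]

# Public checkpoint used here as a stand-in for the local fine-tuned model.
model = SentenceTransformer("sentence-transformers/paraphrase-albert-base-v2")

# encode() accepts a list and returns one embedding per string (a 2-D array),
# so one cosine_similarity call yields the whole pairwise matrix.
embeddings = model.encode(addresses)
similarity_matrix = cosine_similarity(embeddings)

for i in range(len(addresses)):
    for j in range(i + 1, len(addresses)):
        print(f'"{addresses[i]}" vs "{addresses[j]}": {similarity_matrix[i][j]:.4f}')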