wldmr committed
Commit 2f47f53 · Parent: 3d0672e

instructor

Files changed (2):
  1. app.py +63 -1
  2. context.py +84 -0
app.py CHANGED
@@ -1,9 +1,11 @@
 
 import streamlit as st
-import pandas as pd
 
+import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 
+from transformers import AutoTokenizer, pipeline
+import numpy as np
 
 def sentence_sim(sentence1, sentence2):
     #model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
@@ -16,9 +18,69 @@ def sentence_sim(sentence1, sentence2):
     cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
     return cos_scores[0][0]
 
+
+def dot_product(v1, v2):
+    return round(np.dot(v1, v2), 3)
+
 st.title('Similarity Computations')
 
 
+if st.button('Sentence Transformer'):
+    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+    sentencetriplets = ["record the play", "play the record", "play the game"]
+    embedding_vec = model.encode(sentencetriplets)
+    # display the pairwise cosine-similarity matrix for the three sentences
+    st.write(util.pytorch_cos_sim(embedding_vec, embedding_vec))
+
+if st.button('Context Sim Bert'):
+    model = 'bert-base-uncased'
+    framework = 'tf'
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    feature_extractor = pipeline(
+        model=model,
+        framework=framework,
+        tokenizer=tokenizer,
+        task="feature-extraction",
+    )
+
+    sentencetriplets = ["record the play", "play the record", "play the game"]
+    index = 0
+    #sentence = sentencetriplets[index]
+    test_word = 'play'
+    test_word_vector = {}
+    for index, sentence in enumerate(sentencetriplets):
+        tokens = tokenizer.tokenize(sentence)
+        vectors = feature_extractor(sentence, return_tensors=True).numpy()
+        test_word_location = [i for i in range(len(tokens)) if test_word == tokens[i]][0]
+        test_word_vector[index] = vectors[0, test_word_location + 1, :]  # position 0 is '[CLS]'
+        magnitude = np.linalg.norm(test_word_vector[index])
+        test_word_vector[index] = test_word_vector[index] / magnitude
+
+    st.write(dot_product(test_word_vector[0], test_word_vector[1]))  # cosine similarity of the unit vectors
+    st.write(dot_product(test_word_vector[1], test_word_vector[2]))
+    st.write(dot_product(test_word_vector[0], test_word_vector[2]))
+
+if st.button('Instructor'):
+    from InstructorEmbedding import INSTRUCTOR
+    model = INSTRUCTOR('hkunlp/instructor-large')
+    sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
+    instruction = "Represent the Science title:"
+    embeddings = model.encode([[instruction, sentence]])
+    st.write(instruction)
+    st.write(embeddings)
+
+    from sklearn.metrics.pairwise import cosine_similarity
+    sentences_a = [['Represent the Science sentence: ', 'Parton energy loss in QCD matter'],
+                   ['Represent the Financial statement: ', 'The Federal Reserve on Wednesday raised its benchmark interest rate.']]
+    sentences_b = [['Represent the Science sentence: ', 'The Chiral Phase Transition in Dissipative Dynamics'],
+                   ['Represent the Financial statement: ', 'The funds rose less than 0.5 per cent on Friday']]
+    embeddings_a = model.encode(sentences_a)
+    embeddings_b = model.encode(sentences_b)
+    similarities = cosine_similarity(embeddings_a, embeddings_b)
+    st.write(sentences_a)
+    st.write(sentences_b)
+    st.write(similarities)
+
 if st.button('Cos Sim MiniLM'):
     #title = "I Tried Using ChatGPT To Earn $6,147 In Just 1 Week"
     #summary = "Unveiling the Reality: The Perils of Using ChatGPT for Content Generation and Monetization"
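
To try these branches locally, the app assumes the standard Streamlit entry point plus the embedding libraries it imports; a minimal setup sketch (package names are the usual PyPI ones, and tensorflow is needed because the feature-extraction pipeline is built with framework='tf'):

    pip install streamlit sentence-transformers transformers tensorflow scikit-learn InstructorEmbedding
    streamlit run app.py

Note that the word lookup in the 'Context Sim Bert' branch assumes the test word survives WordPiece tokenization as a single token; a word split into subword pieces would need its piece vectors pooled instead.
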
context.py ADDED
@@ -0,0 +1,84 @@
+import json
+from typing import Any
+
+import numpy as np
+from transformers import AutoTokenizer, pipeline
+
+__all__ = ["ContextAwareWordVectors", "print_results"]
+
+
+class NumpyFloatValuesEncoder(json.JSONEncoder):
+    def default(self, obj: Any) -> Any:
+        if isinstance(obj, np.float32):
+            return round(float(obj), 3)
+        return json.JSONEncoder.default(self, obj)
+
+
+def print_results():
+    with open("sentences.json", encoding="utf-8") as fp:
+        samples = json.load(fp)
+    context_aware_word_vectors = ContextAwareWordVectors(model="bert-base-uncased")
+    results = context_aware_word_vectors.run(samples)
+    print(json.dumps(results, indent=2, cls=NumpyFloatValuesEncoder))
+
+
+class ContextAwareWordVectors:
+    def __init__(self, model: str, framework: str = "tf") -> None:
+        self.framework = framework
+        self.model = model
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.feature_extractor = pipeline(
+            model=model,
+            framework=framework,
+            tokenizer=self.tokenizer,
+            task="feature-extraction",
+        )
+
+    def dot_product(self, v1: Any, v2: Any) -> Any:
+        return round(np.dot(v1, v2), 3)
+
+    def euclidean_distance(self, v1: Any, v2: Any) -> Any:
+        return round(np.linalg.norm(v1 - v2), 3)
+
+    def manhattan_distance(self, v1: Any, v2: Any) -> Any:
+        return round(np.linalg.norm(v1 - v2, ord=1), 3)
+
+    def run(self, samples: dict[str, dict[str, str]]) -> dict[str, dict[str, Any]]:
+        test_word_vector: dict[str, Any]
+        results: dict[str, dict[str, Any]] = {}
+
+        for test_word, sample in samples.items():
+            results[test_word] = {}
+            test_word_vector = {}
+            for index, sentence in sample.items():
+                tokens = self.tokenizer.tokenize(sentence)
+                vectors = self.feature_extractor(sentence, return_tensors=True).numpy()
+                test_word_location = [
+                    i for i in range(len(tokens)) if test_word == tokens[i]
+                ][0]
+                test_word_vector[index] = vectors[
+                    0, test_word_location + 1, :
+                ]  # position 0 is '[CLS]'
+                magnitude = np.linalg.norm(test_word_vector[index])
+                test_word_vector[index] = test_word_vector[index] / magnitude
+            results[test_word]["sentences"] = sample
+            results[test_word]["dot_product"] = [
+                self.dot_product(test_word_vector["1"], test_word_vector["2"]),
+                self.dot_product(test_word_vector["2"], test_word_vector["3"]),
+                self.dot_product(test_word_vector["3"], test_word_vector["1"]),
+            ]
+            results[test_word]["euclidean_distance"] = [
+                self.euclidean_distance(test_word_vector["1"], test_word_vector["2"]),
+                self.euclidean_distance(test_word_vector["2"], test_word_vector["3"]),
+                self.euclidean_distance(test_word_vector["3"], test_word_vector["1"]),
+            ]
+            results[test_word]["manhattan_distance"] = [
+                self.manhattan_distance(test_word_vector["1"], test_word_vector["2"]),
+                self.manhattan_distance(test_word_vector["2"], test_word_vector["3"]),
+                self.manhattan_distance(test_word_vector["3"], test_word_vector["1"]),
+            ]
+        return results
+
+
+if __name__ == "__main__":
+    print_results()
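
context.py reads its samples from a sentences.json file that is not part of this commit. Judging from how run() indexes the data (each test word maps to sentences keyed "1" through "3", and each sentence must contain the test word as a whole token), a hypothetical example of the expected shape:

    {
      "play": {
        "1": "record the play",
        "2": "play the record",
        "3": "play the game"
      }
    }

With such a file in place, python context.py prints each word's pairwise dot products, Euclidean distances, and Manhattan distances as JSON.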