Spaces: instructor
- app.py +63 -1
- context.py +84 -0
app.py
CHANGED
@@ -1,9 +1,11 @@
 
 import streamlit as st
-import pandas as pd
 
+import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 
+from transformers import AutoTokenizer, pipeline
+import numpy as np
 
 def sentence_sim(sentence1, sentence2):
     #model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
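The diff elides the middle of sentence_sim between the two hunks. Judging from the commented-out model line above and the cosine step in the next hunk, the body presumably follows the standard sentence-transformers pattern; a minimal sketch, assuming one of the MiniLM checkpoints used elsewhere in this file:

def sentence_sim(sentence1, sentence2):
    # sketch of the elided body: pick a model and embed both sentences
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # assumed checkpoint
    embedding1 = model.encode(sentence1, convert_to_tensor=True)
    embedding2 = model.encode(sentence2, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
    return cos_scores[0][0]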
@@ -16,9 +18,69 @@ def sentence_sim(sentence1, sentence2):
     cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
     return cos_scores[0][0]
 
+
+def dot_product(v1, v2):
+    return round(np.dot(v1, v2), 3)
+
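A note on dot_product: the handlers below normalize each vector to unit length before calling it, and the dot product of unit vectors is exactly their cosine similarity. A quick check of that equivalence:

import numpy as np
v1, v2 = np.array([3.0, 4.0]), np.array([1.0, 0.0])
u1, u2 = v1 / np.linalg.norm(v1), v2 / np.linalg.norm(v2)
print(round(np.dot(u1, u2), 3))  # 0.6, the cosine of the angle between v1 and v2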
 st.title('Similarity Computations')
 
 
+if st.button('Sentence Transformer'):
+    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+    sentencetriplets = ["record the play", "play the record", "play the game"]
+    embedding_vec = {}
+    embedding_vec[0] = model.encode(sentencetriplets[0])
+    embedding_vec[1] = model.encode(sentencetriplets[1])
+
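As committed, this handler encodes only two of the three sentences and never writes anything to the page. A minimal completion (hypothetical, not part of the commit) that encodes the third sentence and displays the pairwise cosine scores:

    embedding_vec[2] = model.encode(sentencetriplets[2])
    # encode() returns numpy arrays; util.pytorch_cos_sim accepts them directly
    for i, j in [(0, 1), (1, 2), (0, 2)]:
        score = util.pytorch_cos_sim(embedding_vec[i], embedding_vec[j]).item()
        st.write(f"{sentencetriplets[i]!r} vs {sentencetriplets[j]!r}: {score:.3f}")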
+if st.button('Context Sim Bert'):
+    model = 'bert-base-uncased'
+    framework = 'tf'
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    feature_extractor = pipeline(
+        model=model,
+        framework=framework,
+        tokenizer=tokenizer,
+        task="feature-extraction",
+    )
+
+    sentencetriplets = ["record the play", "play the record", "play the game"]
+    index = 0
+    #sentence = sentencetriplets[index]
+    test_word = 'play'
+    test_word_vector = {}
+    for index, sentence in enumerate(sentencetriplets):
+        tokens = tokenizer.tokenize(sentence)
+        vectors = feature_extractor(sentence, return_tensors=True).numpy()
+        test_word_location = [i for i in range(len(tokens)) if test_word == tokens[i]][0]
+        test_word_vector[index] = vectors[0, test_word_location + 1, :]  # 0 is '[CLS]'
+        magnitude = np.linalg.norm(test_word_vector[index])
+        test_word_vector[index] = test_word_vector[index] / magnitude
+
+    dot_product(test_word_vector[0], test_word_vector[1])
+    dot_product(test_word_vector[1], test_word_vector[2])
+    dot_product(test_word_vector[0], test_word_vector[2])
+
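As committed, this handler computes the three dot products and discards them, so pressing the button renders nothing; it will also raise IndexError if the tokenizer ever splits test_word into word pieces (not the case for 'play', a single BERT token). A minimal display fix, keeping the commit's names:

    st.write(dot_product(test_word_vector[0], test_word_vector[1]))  # "record the play" vs "play the record"
    st.write(dot_product(test_word_vector[1], test_word_vector[2]))  # "play the record" vs "play the game"
    st.write(dot_product(test_word_vector[0], test_word_vector[2]))  # "record the play" vs "play the game"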
+if st.button('Instructor'):
+    from InstructorEmbedding import INSTRUCTOR
+    model = INSTRUCTOR('hkunlp/instructor-large')
+    sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
+    instruction = "Represent the Science title:"
+    embeddings = model.encode([[instruction, sentence]])
+    st.write(instruction)
+    st.write(embeddings)
+
+    from sklearn.metrics.pairwise import cosine_similarity
+    sentences_a = [['Represent the Science sentence: ', 'Parton energy loss in QCD matter'],
+                   ['Represent the Financial statement: ', 'The Federal Reserve on Wednesday raised its benchmark interest rate.']]
+    sentences_b = [['Represent the Science sentence: ', 'The Chiral Phase Transition in Dissipative Dynamics'],
+                   ['Represent the Financial statement: ', 'The funds rose less than 0.5 per cent on Friday']]
+    embeddings_a = model.encode(sentences_a)
+    embeddings_b = model.encode(sentences_b)
+    similarities = cosine_similarity(embeddings_a, embeddings_b)
+    st.write(sentences_a)
+    st.write(sentences_b)
+    st.write(similarities)
+
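Here cosine_similarity returns a 2x2 matrix scoring every sentences_a row against every sentences_b row; only the diagonal compares matching domains (science vs science, finance vs finance). A one-line addition (not in the commit) to surface just those:

    st.write(np.diag(similarities))  # [science pair, finance pair]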
 if st.button('Cos Sim MiniLM'):
     #title = "I Tried Using ChatGPT To Earn $6,147 In Just 1 Week"
     #summary = "Unveiling the Reality: The Perils of Using ChatGPT for Content Generation and Monetization"
context.py
ADDED
@@ -0,0 +1,84 @@
+import json
+from typing import Any
+
+import numpy as np
+from transformers import AutoTokenizer, pipeline
+
+__all__ = ["ContextAwareWordVectors", "print_results"]
+
+
+class NumpyFloatValuesEncoder(json.JSONEncoder):
+    def default(self, obj: Any) -> Any:
+        if isinstance(obj, np.float32):
+            return round(float(obj), 3)
+        return json.JSONEncoder.default(self, obj)
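The custom encoder exists because the standard json module rejects NumPy scalars; a quick illustration (not part of the file):

import json
import numpy as np
# json.dumps({"score": np.float32(0.51234)}) raises TypeError without the encoder
print(json.dumps({"score": np.float32(0.51234)}, cls=NumpyFloatValuesEncoder))  # {"score": 0.512}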
+
+
+def print_results():
+    with open("sentences.json", encoding="utf-8") as fp:
+        samples = json.load(fp)
+    context_aware_word_vectors = ContextAwareWordVectors(model="bert-base-uncased")
+    results = context_aware_word_vectors.run(samples)
+    print(json.dumps(results, indent=2, cls=NumpyFloatValuesEncoder))
+
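print_results expects a sentences.json next to the script. The run loop below reads each sample with the string keys "1", "2", "3", so a compatible file (illustrative contents, borrowing the triplet from app.py) can be generated like this:

import json
sample = {
    "play": {
        "1": "record the play",
        "2": "play the record",
        "3": "play the game",
    }
}
with open("sentences.json", "w", encoding="utf-8") as fp:
    json.dump(sample, fp, indent=2)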
+
+class ContextAwareWordVectors:
+    def __init__(self, model: str, framework: str = "tf") -> None:
+        self.framework = framework
+        self.model = model
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.feature_extractor = pipeline(
+            model=model,
+            framework=framework,
+            tokenizer=self.tokenizer,
+            task="feature-extraction",
+        )
+
+    def dot_product(self, v1: Any, v2: Any) -> Any:
+        return round(np.dot(v1, v2), 3)
+
+    def euclidean_distance(self, v1: Any, v2: Any) -> Any:
+        return round(np.linalg.norm(v1 - v2), 3)
+
+    def manhattan_distance(self, v1: Any, v2: Any) -> Any:
+        return round(np.linalg.norm(v1 - v2, ord=1), 3)
+
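Since run() normalizes every vector to unit length, these metrics are tightly coupled: for unit vectors, ||v1 - v2||^2 = 2 - 2 * (v1 . v2), so the Euclidean distance is a monotone transform of the dot product. A quick check:

import numpy as np
rng = np.random.default_rng(0)
v1, v2 = rng.normal(size=768), rng.normal(size=768)
v1, v2 = v1 / np.linalg.norm(v1), v2 / np.linalg.norm(v2)
assert np.isclose(np.linalg.norm(v1 - v2) ** 2, 2 - 2 * np.dot(v1, v2))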
+    def run(self, samples: dict[str, dict[str, str]]) -> dict[str, dict[str, Any]]:
+        test_word_vector: dict[str, Any]
+        results: dict[str, dict[str, Any]] = {}
+
+        for test_word, sample in samples.items():
+            results[test_word] = {}
+            test_word_vector = {}
+            for index, sentence in sample.items():
+                tokens = self.tokenizer.tokenize(sentence)
+                vectors = self.feature_extractor(sentence, return_tensors=True).numpy()
+                test_word_location = [
+                    i for i in range(len(tokens)) if test_word == tokens[i]
+                ][0]
+                test_word_vector[index] = vectors[
+                    0, test_word_location + 1, :
+                ]  # 0 is '[CLS]'
+                magnitude = np.linalg.norm(test_word_vector[index])
+                test_word_vector[index] = test_word_vector[index] / magnitude
+            results[test_word]["sentences"] = sample
+            results[test_word]["dot_product"] = [
+                self.dot_product(test_word_vector["1"], test_word_vector["2"]),
+                self.dot_product(test_word_vector["2"], test_word_vector["3"]),
+                self.dot_product(test_word_vector["3"], test_word_vector["1"]),
+            ]
+            results[test_word]["euclidean_distance"] = [
+                self.euclidean_distance(test_word_vector["1"], test_word_vector["2"]),
+                self.euclidean_distance(test_word_vector["2"], test_word_vector["3"]),
+                self.euclidean_distance(test_word_vector["3"], test_word_vector["1"]),
+            ]
+            results[test_word]["manhattan_distance"] = [
+                self.manhattan_distance(test_word_vector["1"], test_word_vector["2"]),
+                self.manhattan_distance(test_word_vector["2"], test_word_vector["3"]),
+                self.manhattan_distance(test_word_vector["3"], test_word_vector["1"]),
+            ]
+        return results
+
+
+if __name__ == "__main__":
+    print_results()
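Like the button handler in app.py, run() assumes the test word survives tokenization as a single token; a word the tokenizer splits into word pieces makes the [0] lookup raise IndexError. A more defensive lookup (hypothetical helper, not in the commit) might be:

def find_token_location(tokens: list[str], test_word: str) -> int:
    # exact match first, then the leading word piece of a split token
    for i, token in enumerate(tokens):
        if token == test_word:
            return i
    for i, token in enumerate(tokens):
        if test_word.startswith(token) and not token.startswith("##"):
            return i
    raise ValueError(f"{test_word!r} not found in {tokens}")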