similarity-st1 / context.py
wldmr's picture
instructor
2f47f53
import json
from typing import Any
import numpy as np
from transformers import AutoTokenizer, pipeline
# Names exported by `from <module> import *`; NumpyFloatValuesEncoder is
# deliberately not listed and is only used by print_results() in this module.
__all__ = ["ContextAwareWordVectors", "print_results"]
class NumpyFloatValuesEncoder(json.JSONEncoder):
    """JSON encoder that writes NumPy floating-point scalars as rounded floats.

    The stock encoder rejects NumPy scalar types it does not recognise; this
    subclass converts them to built-in ``float`` values rounded to three
    decimal places so metric results can be dumped directly.
    """

    def default(self, obj: Any) -> Any:
        """Return a JSON-serialisable replacement for *obj*.

        Args:
            obj: A value the base encoder could not serialise on its own.

        Returns:
            ``float(obj)`` rounded to 3 decimals when *obj* is a NumPy
            floating-point scalar.

        Raises:
            TypeError: Raised by the base class for any other unsupported type.
        """
        # np.floating is the common base of every NumPy float scalar, so this
        # covers float16/float32/longdouble rather than float32 alone.
        if isinstance(obj, np.floating):
            return round(float(obj), 3)
        return super().default(obj)
def print_results(
    samples_path: str = "sentences.json",
    model: str = "bert-base-uncased",
) -> None:
    """Load sentence samples, compare the word vectors, and print JSON.

    Args:
        samples_path: Path of the UTF-8 JSON file holding the samples that
            are passed to ``ContextAwareWordVectors.run``. Defaults to
            ``"sentences.json"`` relative to the current working directory.
        model: Model identifier handed to ``ContextAwareWordVectors``.
            Defaults to ``"bert-base-uncased"``.

    The results are written to standard output as JSON indented by two
    spaces, using ``NumpyFloatValuesEncoder`` for NumPy float values.
    """
    with open(samples_path, encoding="utf-8") as fp:
        samples = json.load(fp)

    word_vectors = ContextAwareWordVectors(model=model)
    results = word_vectors.run(samples)
    print(json.dumps(results, indent=2, cls=NumpyFloatValuesEncoder))
class ContextAwareWordVectors:
    """Compare the contextual vector of one word across three sentences.

    For every test word, the vector of that word is taken from the
    feature-extraction output of each sentence, scaled to unit length, and
    compared pairwise with three metrics (dot product, Euclidean distance,
    Manhattan distance).
    """

    # Sentence-key pairs compared by run(), in the order the metric lists
    # are reported.
    _SENTENCE_PAIRS = (("1", "2"), ("2", "3"), ("3", "1"))

    def __init__(self, model: str, framework: str = "tf") -> None:
        """Load the tokenizer and the feature-extraction pipeline.

        Args:
            model: Model identifier passed to ``AutoTokenizer.from_pretrained``
                and to ``pipeline``.
            framework: Framework name passed to ``pipeline``; defaults to
                ``"tf"``.
        """
        self.framework = framework
        self.model = model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.feature_extractor = pipeline(
            model=model,
            framework=framework,
            tokenizer=self.tokenizer,
            task="feature-extraction",
        )

    def dot_product(self, v1: Any, v2: Any) -> Any:
        """Return the dot product of *v1* and *v2*, rounded to 3 decimals.

        run() passes unit-length vectors, in which case this value is their
        cosine similarity.
        """
        return round(np.dot(v1, v2), 3)

    def euclidean_distance(self, v1: Any, v2: Any) -> Any:
        """Return the L2 norm of ``v1 - v2``, rounded to 3 decimals."""
        return round(np.linalg.norm(v1 - v2), 3)

    def manhattan_distance(self, v1: Any, v2: Any) -> Any:
        """Return the L1 norm of ``v1 - v2``, rounded to 3 decimals."""
        return round(np.linalg.norm(v1 - v2, ord=1), 3)

    def _unit_word_vector(self, test_word: str, index: str, sentence: str) -> Any:
        """Return the unit-length vector of *test_word* within *sentence*.

        Raises:
            IndexError: If no token of *sentence* equals *test_word* exactly.
        """
        tokens = self.tokenizer.tokenize(sentence)
        # Locate the word before running the model so a bad sample fails
        # quickly and with a message that identifies it.
        try:
            location = tokens.index(test_word)
        except ValueError:
            raise IndexError(
                f"test word {test_word!r} not found among the tokens of "
                f"sentence {index!r}: {sentence!r}"
            ) from None

        vectors = self.feature_extractor(sentence, return_tensors=True).numpy()
        # Position 0 of the extractor output is '[CLS]', so token i of the
        # tokenizer output sits at position i + 1.
        vector = vectors[0, location + 1, :]
        return vector / np.linalg.norm(vector)

    def run(self, samples: dict[str, dict[str, str]]) -> dict[str, dict[str, Any]]:
        """Compute pairwise similarity metrics for every test word.

        Args:
            samples: Maps each test word to a mapping of sentence key to
                sentence. The keys ``"1"``, ``"2"`` and ``"3"`` must be
                present. The test word has to equal one of the tokens the
                tokenizer produces for each sentence (only the first
                occurrence is used).

        Returns:
            A dict keyed by test word. Each value holds ``"sentences"`` (the
            input mapping) plus ``"dot_product"``, ``"euclidean_distance"``
            and ``"manhattan_distance"``, each a list of three values for the
            sentence pairs (1, 2), (2, 3) and (3, 1).

        Raises:
            IndexError: If a test word is not among the tokens of a sentence.
            KeyError: If a sample lacks one of the keys "1", "2" or "3".
        """
        metrics = (
            ("dot_product", self.dot_product),
            ("euclidean_distance", self.euclidean_distance),
            ("manhattan_distance", self.manhattan_distance),
        )
        results: dict[str, dict[str, Any]] = {}
        for test_word, sample in samples.items():
            word_vectors = {
                index: self._unit_word_vector(test_word, index, sentence)
                for index, sentence in sample.items()
            }
            entry: dict[str, Any] = {"sentences": sample}
            for name, metric in metrics:
                entry[name] = [
                    metric(word_vectors[first], word_vectors[second])
                    for first, second in self._SENTENCE_PAIRS
                ]
            results[test_word] = entry
        return results
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    print_results()