# NOTE: "Spaces: Sleeping" banner text from the Hugging Face Spaces page was
# captured along with this source; kept here as a comment so the file parses.
import json
from typing import Any

import numpy as np
from transformers import AutoTokenizer, pipeline

# Public API of this module.
__all__ = ["ContextAwareWordVectors", "print_results"]
class NumpyFloatValuesEncoder(json.JSONEncoder):
    """JSON encoder that renders NumPy float scalars as floats rounded to 3 places.

    ``json.dumps`` cannot serialize NumPy scalar types that are not subclasses
    of Python ``float`` (e.g. ``np.float32``); this encoder converts them.
    """

    def default(self, obj: Any) -> Any:
        # np.floating covers float16/float32/etc.; the original only handled
        # np.float32, so other non-float-subclass NumPy scalars raised
        # TypeError. (np.float64 subclasses float and never reaches default.)
        if isinstance(obj, np.floating):
            return round(float(obj), 3)
        # Fall back to the base class, which raises TypeError for unknowns.
        return json.JSONEncoder.default(self, obj)
def print_results() -> None:
    """Load samples from ``sentences.json``, run the vectorizer, print JSON.

    Expects ``sentences.json`` in the working directory to map each test word
    to a dict of sentences (see ``ContextAwareWordVectors.run``). Results are
    printed as indented JSON with NumPy floats rounded to 3 decimal places.
    """
    with open("sentences.json", encoding="utf-8") as fp:
        samples = json.load(fp)
    vectorizer = ContextAwareWordVectors(model="bert-base-uncased")
    results = vectorizer.run(samples)
    print(json.dumps(results, indent=2, cls=NumpyFloatValuesEncoder))
class ContextAwareWordVectors:
    """Compare contextual embeddings of one word across different sentences.

    For each test word, the word's token vector is extracted from each sample
    sentence via a feature-extraction pipeline, L2-normalized, and compared
    pairwise with dot product, Euclidean distance, and Manhattan distance.
    """

    def __init__(self, model: str, framework: str = "tf") -> None:
        """Build the tokenizer and feature-extraction pipeline for *model*.

        :param model: Hugging Face model name, e.g. ``"bert-base-uncased"``.
        :param framework: Pipeline backend, ``"tf"`` or ``"pt"``.
        """
        self.framework = framework
        self.model = model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.feature_extractor = pipeline(
            model=model,
            framework=framework,
            tokenizer=self.tokenizer,
            task="feature-extraction",
        )

    def dot_product(self, v1: Any, v2: Any) -> Any:
        """Return the dot product of *v1* and *v2*, rounded to 3 decimals."""
        return round(np.dot(v1, v2), 3)

    def euclidean_distance(self, v1: Any, v2: Any) -> Any:
        """Return the L2 distance between *v1* and *v2*, rounded to 3 decimals."""
        return round(np.linalg.norm(v1 - v2), 3)

    def manhattan_distance(self, v1: Any, v2: Any) -> Any:
        """Return the L1 distance between *v1* and *v2*, rounded to 3 decimals."""
        return round(np.linalg.norm(v1 - v2, ord=1), 3)

    def run(self, samples: dict[str, dict[str, str]]) -> dict[str, dict[str, Any]]:
        """Compute pairwise similarity metrics for each test word.

        :param samples: Maps each test word to a dict of sentences keyed by
            index. NOTE(review): the pairwise comparisons below hard-code keys
            "1", "2", "3" — each sample must contain exactly those keys
            (matches the ``sentences.json`` format this was written for).
        :returns: Per test word: the input sentences plus three metric lists,
            each holding the (1,2), (2,3), and (3,1) pairwise values.
        :raises ValueError: if a test word does not appear verbatim as a
            single token of its sentence. (Previously an IndexError from an
            empty-list subscript; the failure mode is the same — a crash on
            a missing token — but with a clearer message.)
        """
        results: dict[str, dict[str, Any]] = {}
        for test_word, sample in samples.items():
            word_vectors: dict[str, Any] = {}
            for index, sentence in sample.items():
                tokens = self.tokenizer.tokenize(sentence)
                vectors = self.feature_extractor(sentence, return_tensors=True).numpy()
                # First occurrence of the target word among the tokens.
                position = tokens.index(test_word)
                # +1 skips the leading '[CLS]' token present in the pipeline
                # output but not in tokenize()'s token list.
                vector = vectors[0, position + 1, :]
                # L2-normalize so dot products behave as cosine similarities.
                word_vectors[index] = vector / np.linalg.norm(vector)
            pairs = [("1", "2"), ("2", "3"), ("3", "1")]
            results[test_word] = {
                "sentences": sample,
                "dot_product": [
                    self.dot_product(word_vectors[a], word_vectors[b])
                    for a, b in pairs
                ],
                "euclidean_distance": [
                    self.euclidean_distance(word_vectors[a], word_vectors[b])
                    for a, b in pairs
                ],
                "manhattan_distance": [
                    self.manhattan_distance(word_vectors[a], word_vectors[b])
                    for a, b in pairs
                ],
            }
        return results
if __name__ == "__main__":
    print_results()