import numpy as np
from transformers import BertTokenizer

from .f import flatten_, assoc, memoize, GetAttr

from typing import List


def fix_byte_spaces(toks: List[str]) -> List[str]:
    # "\u0120" (Ġ) marks a leading space and "\u010A" (Ċ) a newline in
    # byte-level BPE tokens; convert them back to readable characters.
    return [t.replace("\u0120", " ").replace("\u010A", "\\n") for t in toks]


@memoize
def get_bpe(bpe_pretrained_name_or_path):
    return BertTokenizer.from_pretrained(bpe_pretrained_name_or_path)


# [String] -> [String]
def remove_CLS_SEP(toks):
    return [t for t in toks if t not in set(["[CLS]", "[SEP]"])]


# torch.Tensor -> np.Array
def process_hidden_tensors(t):
    """Embeddings are returned from the BERT model in a non-ideal shape:

    - an unnecessary batch dimension
    - the undesired second-sentence "[SEP]"

    Drop the unnecessary information and return only what is needed for the
    first sentence.
    """
    # Drop the unnecessary batch dim and the trailing second-sentence token
    t = t.squeeze(0)[:-1]

    # Drop the leading "[CLS]" and the remaining "[SEP]"
    t = t[1:-1]

    # Convert to numpy
    return t.data.numpy()


# np.Array -> np.Array
def normalize(a):
    """Divide each head by its norm"""
    norms = np.linalg.norm(a, axis=-1, keepdims=True)
    return a / norms


# np.Array -> np.Array
def reshape(a):
    """Combine the last two dimensions of a numpy array"""
    all_head_size = a.shape[-2] * a.shape[-1]
    new_shape = a.shape[:-2] + (all_head_size,)
    return a.reshape(new_shape)
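

# A minimal usage sketch of the helpers above, not part of the library API.
# It assumes the module is run in its package context (e.g. `python -m ...`,
# since the relative import of `.f` requires it) and uses a dummy numpy array
# with illustrative shapes in place of real BERT hidden states.
if __name__ == "__main__":
    toks = ["[CLS]", "\u0120hello", "\u0120world", "[SEP]"]
    print(remove_CLS_SEP(fix_byte_spaces(toks)))  # [' hello', ' world']

    a = np.random.rand(4, 2, 3)  # (tokens, heads, head_size) -- made-up sizes
    flat = reshape(a)            # heads flattened -> shape (4, 6)
    unit = normalize(flat)       # each row scaled to unit L2 norm
    print(flat.shape, np.linalg.norm(unit, axis=-1))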