Benjamin Hoover committed
Commit a283b22
1 Parent(s): 34b8a50

Simplify model_api in server
environment.yml CHANGED
@@ -5,21 +5,14 @@ channels:
   - defaults
   - anaconda
 dependencies:
+  - pip>=19.0.3
   - python=3.7
   - connexion=1.5.3
-  - h5py
-  - spacy
   - boto3
   - regex
   - flask-cors
-  - faiss-cpu
   - jinja2=2.10
-  - numpy=1.16.2
-  - olefile=0.46
-  - pickleshare=0.7.5
-  - pillow=5.4.1
-  - pip=19.0.3
-  - pytorch=1.0.1
-  - sacremoses
-  - pip:
-    - sentencepiece
+  - numpy
+  - pytorch
+  - torchvision
+  - transformers
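The dependency list is pared down to what the simplified server actually imports: the spacy / h5py / faiss-cpu stack and the hard version pins are dropped, while torchvision and transformers are added. A quick smoke test (hypothetical, not part of this commit) to confirm the slimmed environment still covers the server's imports might look like:

```
# Hypothetical smoke test -- not part of this commit -- to confirm the trimmed
# environment.yml still provides every package the simplified server imports.
import importlib

required = ["numpy", "torch", "torchvision", "transformers", "connexion", "flask_cors"]

for name in required:
    importlib.import_module(name)  # raises ImportError if the environment is incomplete
    print("ok:", name)
```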
server/model_api.py ADDED
@@ -0,0 +1,161 @@
+from typing import List, Union, Tuple
+
+import torch
+from transformers import AutoConfig, AutoTokenizer, AutoModelWithLMHead, AutoModel
+
+from transformer_formatter import TransformerOutputFormatter
+from utils.f import delegates, pick, memoize
+
+def get_model_tok(mname):
+    conf = AutoConfig.from_pretrained(mname, output_attentions=True, output_past=False)
+    tok = AutoTokenizer.from_pretrained(mname, config=conf)
+    model = AutoModelWithLMHead.from_pretrained(mname, config=conf)
+    return model, tok
+
+class ModelDetails:
+    """Wraps a transformer model and tokenizer to prepare inputs to the frontend visualization"""
+    def __init__(self, mname):
+        self.mname = mname
+        self.model, self.tok = get_model_tok(self.mname)
+        self.model.eval()
+        self.config = self.model.config
+
+    def from_sentence(self, sentence: str) -> TransformerOutputFormatter:
+        """Get attentions and word probabilities from a sentence. Special tokens are automatically added if a sentence is passed.
+
+        Args:
+            sentence: The input sentence to tokenize and analyze.
+        """
+        tokens = self.tok.tokenize(sentence)
+
+        return self.from_tokens(tokens, sentence, add_special_tokens=True)
+
+    def from_tokens(
+        self, tokens: List[str], orig_sentence:str, add_special_tokens:bool=False, mask_attentions:bool=False, topk:int=5
+    ) -> TransformerOutputFormatter:
+        """Get formatted attention and predictions from a list of tokens.
+
+        Args:
+            tokens: Tokens to analyze
+            orig_sentence: The sentence the tokens came from (needed to help organize the output)
+            add_special_tokens: Whether to add special tokens like CLS / <|endoftext|> to the tokens.
+                If False, assume the tokens already have the special tokens
+            mask_attentions: If True, mask attention to the special tokens as the input passes through the model.
+            topk: How many top predictions to report
+        """
+        ids = self.tok.convert_tokens_to_ids(tokens)
+
+        # For GPT2, add the beginning of sentence token to the input. Note that this will work on all models but XLM
+        bost = self.tok.bos_token_id
+        clst = self.tok.cls_token_id
+        if (bost is not None) and (bost != clst) and add_special_tokens:
+            ids.insert(0, bost)
+
+        inputs = self.tok.prepare_for_model(ids, add_special_tokens=add_special_tokens, return_tensors="pt")
+        parsed_input = self.parse_inputs(inputs, mask_attentions=mask_attentions)
+        output = self.model(parsed_input['input_ids'], attention_mask=parsed_input['attention_mask'])
+
+        logits, atts = self.choose_logits_att(output)
+        words, probs = self.logits2words(logits, topk)
+        tokens = self.view_ids(inputs["input_ids"])
+
+        formatted_output = TransformerOutputFormatter(
+            orig_sentence,
+            tokens,
+            inputs["special_tokens_mask"],
+            atts,
+            words,
+            probs.tolist(),
+            self.config
+        )
+
+        return formatted_output
+
+    def choose_logits_att(self, out:Tuple) -> Tuple:
+        """Select from the model's output the logits and the attentions, switching on model name
+
+        Args:
+            out: Output from the model's forward pass
+
+        Returns:
+            (logits: tensor((bs, N)), attentions: Tuple[tensor(())])
+        """
+        if 't5' in self.mname:
+            logits, _, atts = out
+        else:
+            logits, atts = out
+
+        print("Logits: ", logits)
+        print("atts: ", atts[0].shape)
+        return logits, atts
+
+    def logits2words(self, logits, topk):
+        """Convert logit probabilities into words from the tokenizer's vocabulary.
+
+        """
+        probs, idxs = torch.topk(torch.softmax(logits.squeeze(0), 1), topk)
+        words = [self.tok.convert_ids_to_tokens(i) for i in idxs]
+        return words, probs
+
+    def view_ids(self, ids: Union[List[int], torch.Tensor]) -> List[str]:
+        """View what the tokenizer thinks certain ids are for a single input"""
+        if type(ids) == torch.Tensor:
+            # Remove batch dimension
+            ids = ids.squeeze(0).tolist()
+
+        out = self.tok.convert_ids_to_tokens(ids)
+        return out
+
+    def parse_inputs(self, inputs, mask_attentions=False):
+        """Parse the output from `tokenizer.prepare_for_model` to the desired attention mask from special tokens
+
+        Args:
+            - inputs: The output of `tokenizer.prepare_for_model`.
+                A dict with keys: {'special_token_mask', 'token_type_ids', 'input_ids'}
+            - mask_attentions: Flag indicating whether to mask the attentions or not
+
+        Returns:
+            Dict with keys: {'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'}
+
+        Usage:
+
+            ```
+            s = "test sentence"
+
+            # from raw sentence to tokens
+            tokens = tokenizer.tokenize(s)
+
+            # From tokens to ids
+            ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # From ids to input
+            inputs = tokenizer.prepare_for_model(ids, return_tensors='pt')
+
+            # Parse the input. Optionally mask the special tokens from the analysis.
+            parsed_input = parse_inputs(inputs)
+
+            # Run the model, pick from this output whatever inputs you want
+            from utils.f import pick
+            out = model(**pick(['input_ids'], parse_inputs(inputs)))
+            ```
+        """
+
+        out = inputs.copy()
+
+        # DEFINE SPECIAL TOKENS MASK
+        if "special_tokens_mask" not in inputs.keys():
+            special_tokens = set([self.tok.unk_token_id, self.tok.cls_token_id, self.tok.sep_token_id, self.tok.bos_token_id, self.tok.eos_token_id, self.tok.pad_token_id])
+            in_ids = inputs['input_ids'][0]
+            special_tok_mask = [1 if int(i) in special_tokens else 0 for i in in_ids]
+            inputs['special_tokens_mask'] = special_tok_mask
+
+        if mask_attentions:
+            out["attention_mask"] = torch.tensor(
+                [int(not i) for i in inputs.get("special_tokens_mask")]
+            ).unsqueeze(0)
+        else:
+            out["attention_mask"] = torch.tensor(
+                [1 for i in inputs.get("special_tokens_mask")]
+            ).unsqueeze(0)
+
+        return out
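Taken together, the new module is the entire inference path the server needs: load the model and tokenizer once, then turn a sentence into per-layer attentions and top-k next-word predictions. A rough usage sketch, assuming it is run from the server/ directory and using "gpt2" purely as an illustrative checkpoint name:

```
# Rough usage sketch for the new model_api.py (assumes the server/ directory is on the path;
# "gpt2" is only an example model name, any AutoModelWithLMHead-compatible checkpoint works).
from model_api import ModelDetails

details = ModelDetails("gpt2")

# Tokenize, add special tokens, run the forward pass, and collect formatted output.
formatted = details.from_sentence("The quick brown fox jumps over the lazy dog")

# One layer at a time, to_json returns the payload the frontend expects under "aa".
payload = formatted.to_json(layer=0)
print(list(payload["aa"].keys()))
```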
server/setup.py CHANGED
@@ -3,8 +3,8 @@ from setuptools import setup, find_packages
 requires = [] # Let conda handle requires
 
 setup(
-    name="exbert",
-    description="Vis",
+    name="exformer",
+    description="Just the attention vis of exbert",
     packages=find_packages(),
     author="IBM Research AI",
     include_package_data=True,
server/transformer_formatter.py CHANGED
@@ -4,7 +4,6 @@ import numpy as np
 import torch
 import json
 
-from spacyface.simple_spacy_token import SimpleSpacyToken
 from utils.token_processing import fix_byte_spaces
 from utils.gen_utils import map_nlist
 
@@ -14,8 +13,8 @@ def round_return_value(attentions, ndigits=5):
 
     attentions: {
         'aa': {
-            left.embeddings & contexts
-            right.embeddings & contexts
+            left
+            right
             att
         }
     }
@@ -25,19 +24,6 @@ def round_return_value(attentions, ndigits=5):
     nested_rounder = partial(map_nlist, rounder)
     new_out = attentions # Modify values to save memory
     new_out["aa"]["att"] = nested_rounder(attentions["aa"]["att"])
-    new_out["aa"]["left"]["embeddings"] = nested_rounder(
-        attentions["aa"]["left"]["embeddings"]
-    )
-    new_out["aa"]["left"]["contexts"] = nested_rounder(
-        attentions["aa"]["left"]["contexts"]
-    )
-
-    new_out["aa"]["right"]["embeddings"] = nested_rounder(
-        attentions["aa"]["right"]["embeddings"]
-    )
-    new_out["aa"]["right"]["contexts"] = nested_rounder(
-        attentions["aa"]["right"]["contexts"]
-    )
 
     return new_out
 
@@ -60,71 +46,40 @@ class TransformerOutputFormatter:
     def __init__(
         self,
         sentence: str,
-        tokens: List[SimpleSpacyToken],
+        tokens: List[str],
        special_tokens_mask: List[int],
         att: Tuple[torch.Tensor],
-        embeddings: Tuple[torch.Tensor],
-        contexts: Tuple[torch.Tensor],
         topk_words: List[List[str]],
-        topk_probs: List[List[float]]
+        topk_probs: List[List[float]],
+        model_config
     ):
         assert len(tokens) > 0, "Cannot have an empty token output!"
 
-        modified_embeddings = flatten_batch(embeddings)
         modified_att = flatten_batch(att)
-        modified_contexts = flatten_batch(contexts)
 
         self.sentence = sentence
         self.tokens = tokens
         self.special_tokens_mask = special_tokens_mask
-        self.embeddings = modified_embeddings
         self.attentions = modified_att
-        self.raw_contexts = modified_contexts
         self.topk_words = topk_words
         self.topk_probs = topk_probs
+        self.model_config = model_config
 
-        self.n_layers = len(contexts) # With +1 for buffer layer at the beginning
-        _, self.__len, self.n_heads, self.hidden_dim = contexts[0].shape
-
-    @property
-    def contexts(self):
-        """Combine the head and the context dimension as it is passed forward in the model"""
-        return squeeze_contexts(self.raw_contexts)
-
-    @property
-    def normed_embeddings(self):
-        ens = tuple([torch.norm(e, dim=-1) for e in self.embeddings])
-        normed_es = tuple([e / en.unsqueeze(-1) for e, en in zip(self.embeddings, ens)])
-        return normed_es
-
-    @property
-    def normed_contexts(self):
-        """Normalize each by head"""
-        cs = self.raw_contexts
-        cns = tuple([torch.norm(c, dim=-1) for c in cs])
-        normed_cs = tuple([c / cn.unsqueeze(-1) for c, cn in zip(cs, cns)])
-        squeezed_normed_cs = squeeze_contexts(normed_cs)
-        return squeezed_normed_cs
+        self.n_layer = self.model_config.n_layer
+        self.n_head = self.model_config.n_head
+        self.hidden_dim = self.model_config.n_embd
+
+        self.__len = len(tokens)  # Get the number of tokens in the input
+        assert self.__len == self.attentions[0].shape[-1], "Attentions don't represent the passed tokens!"
 
     def to_json(self, layer:int, ndigits=5):
         """The original API expects the following response:
 
         aa: {
             att: number[][][]
-            left: <FullSingleTokenInfo[]>
-            right: <FullSingleTokenInfo[]>
+            left: List[str]
+            right: List[str]
         }
-
-        FullSingleTokenInfo:
-        {
-            text: string
-            embeddings: number[]
-            contexts: number[]
-            bpe_token: string
-            bpe_pos: string
-            bpe_dep: string
-            bpe_is_ent: boolean
-        }
         """
         # Convert the embeddings, attentions, and contexts into list. Perform rounding
 
@@ -133,25 +88,16 @@ class TransformerOutputFormatter:
 
         def tolist(tens): return [t.tolist() for t in tens]
 
-        def to_resp(tok: SimpleSpacyToken, embeddings: List[float], contexts: List[float], topk_words, topk_probs):
+        def to_resp(tok: str, topk_words, topk_probs):
             return {
-                "text": tok.token,
-                "bpe_token": tok.token,
-                "bpe_pos": tok.pos,
-                "bpe_dep": tok.dep,
-                "bpe_is_ent": tok.is_ent,
-                "embeddings": nested_rounder(embeddings),
-                "contexts": nested_rounder(contexts),
+                "text": tok,
                 "topk_words": topk_words,
                 "topk_probs": nested_rounder(topk_probs)
             }
 
-        side_info = [to_resp(t, e, c, w, p) for t,e,c,w,p in zip(
-            self.tokens,
-            tolist(self.embeddings[layer]),
-            tolist(self.contexts[layer]),
-            self.topk_words,
-            self.topk_probs)]
+        side_info = [to_resp(t, w, p) for t,w,p in zip( self.tokens,
+            self.topk_words,
+            self.topk_probs)]
 
         out = {"aa": {
             "att": nested_rounder(tolist(self.attentions[layer])),
@@ -164,42 +110,6 @@ class TransformerOutputFormatter:
     def display_tokens(self, tokens):
         return fix_byte_spaces(tokens)
 
-    def to_hdf5_meta(self):
-        """Output metadata information to store as hdf5 metadata for a group"""
-        token_dtype = self.tokens[0].hdf5_token_dtype
-        out = {k: np.array([t[k] for t in self.tokens], dtype=np.dtype(dtype)) for k, dtype in token_dtype}
-        out['sentence'] = self.sentence
-        return out
-
-    def to_hdf5_content(self, do_norm=True):
-        """Return dictionary of {attentions, embeddings, contexts} formatted as array for hdf5 file"""
-
-        def get_embeds(c):
-            if do_norm: return c.normed_embeddings
-            return c.embeddings
-
-        def get_contexts(c):
-            if do_norm: return c.normed_contexts
-            return c.contexts
-
-        embeddings = to_numpy(get_embeds(self))
-        contexts = to_numpy(get_contexts(self))
-        atts = to_numpy(self.attentions)
-
-        return {
-            "embeddings": embeddings,
-            "contexts": contexts,
-            "attentions": atts
-        }
-
-    @property
-    def searchable_embeddings(self):
-        return np.array(list(map(to_searchable, self.embeddings)))
-
-    @property
-    def searchable_contexts(self):
-        return np.array(list(map(to_searchable, self.contexts)))
-
     def __repr__(self):
         lim = 50
         if len(self.sentence) > lim: s = self.sentence[:lim - 3] + "..."
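With the spacy metadata gone, the formatter's constructor now only needs plain token strings, the attention tensors, the top-k predictions, and the Hugging Face model config, from which it reads n_layer, n_head, and n_embd and checks that the attention shape matches the token count. A minimal sketch of constructing it directly with dummy data (not in the commit; assumed layout: one (batch, heads, tokens, tokens) tensor per layer):

```
# Minimal sketch (not in the commit) of the slimmed-down constructor, using dummy tensors.
import torch
from transformers import AutoConfig
from transformer_formatter import TransformerOutputFormatter

config = AutoConfig.from_pretrained("gpt2")   # any config exposing n_layer / n_head / n_embd
tokens = ["Hello", "world"]

# Assumed layout: one attention tensor of shape (batch, heads, tokens, tokens) per layer.
att = tuple(torch.rand(1, config.n_head, len(tokens), len(tokens)) for _ in range(config.n_layer))

formatted = TransformerOutputFormatter(
    "Hello world",                   # sentence
    tokens,                          # tokens: List[str]
    [0, 0],                          # special_tokens_mask
    att,                             # att: Tuple[torch.Tensor]
    [["!", ","], ["again", "."]],    # topk_words
    [[0.1, 0.05], [0.2, 0.1]],       # topk_probs
    config,                          # model_config
)
print(formatted.n_layer, formatted.n_head, formatted.hidden_dim)
```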
server/utils/gen_utils.py CHANGED
@@ -1,15 +1,8 @@
-import spacy
 from copy import deepcopy
 import numpy as np
 from functools import partial
 from .f import memoize
 
-def add_base_exceptions(language_exceptions):
-    merged = {}
-    merged.update(language_exceptions)
-    merged.update(spacy.lang.tokenizer_exceptions.BASE_EXCEPTIONS)
-    return merged
-
 def check_key_len(d, length):
     for k, v in d.items():
         if len(v) != length:
server/utils/token_processing.py CHANGED
@@ -1,191 +1,16 @@
-"""Defines the important metadata to extract for each token.
-
-If adding more metadata, modify the definitions in `to_spacy_meta` and `meta_to_hdf5`
-"""
-import h5py
 import numpy as np
-import spacy
 from transformers.tokenization_bert import BertTokenizer
 from .f import flatten_, assoc, memoize, GetAttr
 
 from typing import List
 
 def fix_byte_spaces(toks: List[str]) -> List[str]:
-    return [t.replace("\u0120", " ") for t in toks]
-
-# NOTE: If you want to change anything that is extracted from the SPACY token, change the functions below.
-# ====================================================================================================
-def simplify_spacy_token(t):
-    """Extract important information from spacy token into a simple dictionary"""
-    def check_ent(tok):
-        OUT_OF_ENT = 2
-        NO_ENT_DEFINED = 0
-        return tok.ent_iob != OUT_OF_ENT and tok.ent_iob != NO_ENT_DEFINED
-
-    return {
-        "token": t.text,
-        "pos": t.pos_,
-        "dep": t.dep_,
-        "norm": t.norm_,
-        "tag": t.tag_,
-        "lemma": t.lemma_,
-        "head": t.head,
-        "is_ent": check_ent(t),
-    }
-
-def null_token_filler(token_text):
-    return {
-        "token": token_text,
-        "pos": None,
-        "dep": None,
-        "norm": None,
-        "tag": None,
-        "lemma": None,
-        "head": None,
-        "is_ent": None,
-    }
-
-token_dtype = [
-    ("token", h5py.special_dtype(vlen=str)),
-    ("pos", h5py.special_dtype(vlen=str)),
-    ("dep", h5py.special_dtype(vlen=str)),
-    ("norm", h5py.special_dtype(vlen=str)),
-    ("tag", h5py.special_dtype(vlen=str)),
-    ("lemma", h5py.special_dtype(vlen=str)),
-    ("head", h5py.special_dtype(vlen=str)),
-    ("is_ent", np.bool_),
-]
-# ====================================================================================================
+    return [t.replace("\u0120", " ").replace("\u010A", "\\n") for t in toks]
 
 @memoize
 def get_bpe(bpe_pretrained_name_or_path):
     return BertTokenizer.from_pretrained(bpe_pretrained_name_or_path)
 
-@memoize
-def get_spacy(spacy_name):
-    return spacy.load(spacy_name)
-
-class TokenAligner:
-    def __init__(
-        self,
-        bpe_pretrained_name_or_path="bert-base-uncased",
-        spacy_name="en_core_web_sm",
-    ):
-        """Create a wrapper around a sentence such that the spacy and BPE tokens can be aligned"""
-        self.bpe = get_bpe(bpe_pretrained_name_or_path)
-        self.nlp = get_spacy(spacy_name)
-
-    def fix_sentence(self, s):
-        return " ".join(self.to_spacy(s))
-
-    def to_spacy(self, s):
-        """Convert a sentence to spacy tokens.
-
-        Note that all contractions are removed in lieu of the word they shorten by taking the 'norm' of the word as defined by spacy.
-        """
-        doc = self.nlp(s)
-        tokens = [t.norm_ for t in doc]
-        return tokens
-
-    def to_spacy_text(self, s):
-        """Convert a sentence into the raw tokens as spacy would.
-
-        No contraction expansion."""
-        doc = self.nlp(s)
-        tokens = [t.text for t in doc]
-        return tokens
-
-    def to_bpe(self, s):
-        """Convert a sentence to bpe tokens"""
-        s = self.fix_sentence(s)
-        s = self.to_bpe_text(s)
-        return s
-
-    def to_bpe_text(self, s):
-        """Convert a sentence to bpe tokens"""
-        return self.bpe.tokenize(s)
-
-    def to_spacy_meta(self, s):
-        """Convert a sentence to spacy tokens with important metadata"""
-        doc = self.nlp(s)
-        out = [simplify_spacy_token(t) for t in doc]
-        return out
-
-    def meta_to_hdf5(self, meta):
-        out_dtype = np.dtype(token_dtype)
-
-        out = [tuple([m[d[0]] for d in token_dtype]) for m in meta]
-        return np.array(out, dtype=out_dtype)
-
-    def meta_hdf5_to_obj(self, meta_hdf5):
-        assert len(meta_hdf5) != 0
-
-        keys = meta_hdf5[0].dtype.names
-        out = {k: [] for k in keys}
-
-        for m in meta_hdf5:
-            for k in m.dtype.names:
-                out[k].append(m[k])
-        return out
-
-    def to_spacy_hdf5(self, s):
-        """Get values for hdf5 store, each row being a tuple of the information desired"""
-        meta = self.to_spacy_meta(s)
-        return self.meta_to_hdf5(meta)
-
-    def to_spacy_hdf5_by_col(self, s):
-        """Get values for hdf5 store, organized as a dictionary into the metadata"""
-        h5_info = self.to_spacy_hdf5(s)
-        return self.meta_hdf5_to_obj(h5_info)
-
-    def bpe_from_meta_single(self, meta_token):
-        """Split a single spacy token with metadata into bpe tokens"""
-
-        bpe_tokens = self.to_bpe(meta_token["norm"])
-
-        # print(bpe_tokens)
-        return [assoc("token", b, meta_token) for b in bpe_tokens]
-
-    def bpe_from_spacy_meta(self, spacy_meta):
-        out = flatten_([self.bpe_from_meta_single(sm) for sm in spacy_meta])
-        return out
-
-    def to_bpe_meta(self, s):
-        """Convert a sentence to bpe tokens with metadata
-
-        Removes all known contractions from input sentence `s`
-        """
-        bpe = self.to_bpe(s)
-        spacy_meta = self.to_spacy_meta(s)
-        return self.bpe_from_spacy_meta(spacy_meta)
-
-    def to_bpe_meta_from_tokens(self, sentence, bpe_tokens):
-        """Get the normal BPE metadata, and add nulls wherever a special_token appears"""
-        bpe_meta = self.to_bpe_meta(sentence)
-
-        new_bpe_meta = []
-        j = 0
-        for i, b in enumerate(bpe_tokens):
-            if b in self.bpe.all_special_tokens:
-                new_bpe_meta.append(null_token_filler(b))
-            else:
-                new_bpe_meta.append(bpe_meta[j])
-                j += 1
-
-        return new_bpe_meta
-
-    def to_bpe_hdf5(self, s):
-        """Format the metadata of a BPE tokenized setence into hdf5 format"""
-        meta = self.to_bpe_meta(s)
-        return self.meta_to_hdf5(meta)
-
-    def to_bpe_hdf5_by_col(self, s):
-        h5_info = self.to_bpe_hdf5(s)
-        return self.meta_hdf5_to_obj(h5_info)
-
-    def meta_tokenize(self, s):
-        return self.to_bpe_meta(s)
-
 # [String] -> [String]
 def remove_CLS_SEP(toks):
     return [t for t in toks if t not in set(["[CLS]", "[SEP]"])]
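Beyond removing the spacy alignment machinery, the only functional change here is that fix_byte_spaces now also makes byte-level newline tokens displayable: GPT-2's byte-level BPE encodes a leading space as "\u0120" (Ġ) and a newline as "\u010A" (Ċ), and both are now replaced before tokens reach the frontend. A tiny illustration of the assumed behavior:

```
# Tiny illustration of the updated fix_byte_spaces (assumes the server/ directory is on the path).
from utils.token_processing import fix_byte_spaces

print(fix_byte_spaces(["\u0120Hello", "\u010A", "world"]))
# -> [' Hello', '\\n', 'world']
```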