dsilin committed on
Commit
30ff282
1 Parent(s): 05f5962

added example run code

Browse files
Files changed (1) hide show
  1. README.md +90 -0
README.md CHANGED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This model can be used to more accurately detokenize text produced by the Moses tokenizer (it does a better job with certain lossy cases, such as quotes and similar ambiguous spacing).
2
+
3
+
4
+ Batched usage (the snippet below assumes `torch`, `tokenizer`, `model` and `device` have already been defined):
5
+
6
+ ```
7
+
8
# Example inputs: Moses-tokenized sentences (punctuation and clitics such as
# "'ll" are split off by spaces) that the model is asked to detokenize.
sentences = [
    "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
    "\" We 'll talk go by now ; \" says Shucksmith ;",
    "He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
    "I think you 'll be amazed at this way it finds ,",
    "Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
    "You 'll finding outs episode 4 .",
    "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
    "You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
    "Wonder if 'll alive to see .",
    "We 'll have to combine and a numbered of people .",
]
20
+
21
def sentences_to_input_tokens(sentences):
    """Tokenize a batch of sentences and right-pad them into model inputs.

    Uses the ``tokenizer``, ``torch`` and ``device`` names from the
    surrounding scope.

    Args:
        sentences: The sentences to encode, passed to ``tokenizer`` as one
            batch.

    Returns:
        A tuple ``(encoding, sents_tokens)``. ``encoding`` is a dict with the
        keys ``"input_ids"``, ``"attention_mask"`` and ``"position_ids"``,
        each a tensor moved to ``device`` and padded to the longest sentence
        in the batch. ``sents_tokens`` holds the unpadded token ids of every
        sentence, in input order.
    """
    sents_tokens = list(tokenizer(sentences)['input_ids'])
    longest = max((len(ids) for ids in sents_tokens), default=0)

    padded_ids = []
    padded_masks = []
    padded_positions = []
    for ids in sents_tokens:
        n_real = len(ids)
        n_pad = longest - n_real
        # Real tokens come first; padding is appended on the right.
        padded_ids.append(ids + [tokenizer.pad_token_id] * n_pad)
        # 1 marks a real token, 0 marks padding.
        padded_masks.append([1] * n_real + [0] * n_pad)
        # Positions count up from 0 for real tokens; padding slots get 0.
        padded_positions.append(list(range(n_real)) + [0] * n_pad)

    batch = {
        "input_ids": torch.tensor(padded_ids).to(device),
        "attention_mask": torch.tensor(padded_masks).to(device),
        "position_ids": torch.tensor(padded_positions).to(device),
    }
    return batch, sents_tokens
65
+
66
def run_token_predictor_sentences(sentences):
    """Detokenize a batch of sentences using the model's per-token predictions.

    Uses the ``model`` and ``tokenizer`` names from the surrounding scope,
    together with ``sentences_to_input_tokens``.

    Args:
        sentences: The sentences to detokenize.

    Returns:
        A list with one detokenized, whitespace-stripped string per
        prediction row, in input order.
    """
    encoding, token_ids = sentences_to_input_tokens(sentences)
    # First element of the model output, one row of per-token scores per
    # sentence (presumably the logits -- confirm against the model class).
    predictions = model(**encoding)[0].cpu().tolist()

    results = []
    for row, scores in enumerate(predictions):
        # [1:-1] drops the first and last position of both sequences
        # (presumably the special tokens added by the tokenizer). zip() stops
        # at the shorter token list, so padded positions are never visited.
        tokens = tokenizer.convert_ids_to_tokens(token_ids[row][1:-1])
        pieces = []
        for token, score in zip(tokens, scores[1:-1]):
            if "▁" not in token:
                # No word-boundary marker: the piece is emitted unchanged.
                pieces.append(token)
            elif score[0] > score[1]:
                # Score 0 wins: the boundary marker becomes a space.
                pieces.append(token.replace("▁", " "))
            else:
                # Score 1 wins: the marker is removed, gluing the piece on.
                pieces.append(token.replace("▁", ""))
        results.append("".join(pieces).strip())
    return results
83
+
84
# Run the example batch and print each original (tokenized) sentence followed
# by the model's detokenized version, with a separator between pairs.
outs = run_token_predictor_sentences(sentences)
for detokenized, original in zip(outs, sentences):
    print(original)
    print(detokenized)
    print('\n------\n')
89
+
90
+ ```