lbourdois committed on
Commit
85ae29b
1 Parent(s): 40bc550

Upload README.md with huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +1 -93
README.md CHANGED
@@ -1,95 +1,3 @@
1
  ---
2
- language: english
3
- widget:
4
- - text: "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,"
5
- - text: "\" We 'll talk go by now ; \" says Shucksmith ;"
6
- - text: "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said ."
7
  ---
8
-
9
- This model can be used to detokenize the output of the Moses tokenizer more accurately (it does a better job in certain lossy cases, such as quotes).
10
-
11
-
12
- Batched usage:
13
-
14
- ```python
15
-
16
- sentences = [
17
- "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
18
- "\" We 'll talk go by now ; \" says Shucksmith ;",
19
- "He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
20
- "I think you 'll be amazed at this way it finds ,",
21
- "Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
22
- "You 'll finding outs episode 4 .",
23
- "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
24
- "You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
25
- "Wonder if 'll alive to see .",
26
- "We 'll have to combine and a numbered of people ."
27
- ]
28
-
29
- def sentences_to_input_tokens(sentences):
30
- all_tokens = []
31
- max_length = 0
32
- sents_tokens = []
33
- iids = tokenizer(sentences)
34
- for sent_tokens in iids['input_ids']:
35
- sents_tokens.append(sent_tokens)
36
-
37
- if len(sent_tokens) > max_length:
38
- max_length = len(sent_tokens)
39
-
40
- attention_mask = [1] * len(sent_tokens)
41
- pos_ids = list(range(len(sent_tokens)))
42
-
43
- encoding = {
44
- "iids": sent_tokens,
45
- "am": attention_mask,
46
- "pos": pos_ids
47
- }
48
-
49
- all_tokens.append(encoding)
50
-
51
- input_ids = []
52
- attention_masks = []
53
- position_ids = []
54
- for i in range(len(all_tokens)):
55
-
56
- encoding = all_tokens[i]
57
-
58
- pad_len = max_length - len(encoding['iids'])
59
- attention_masks.append(encoding['am'] + [0] * pad_len)
60
- position_ids.append(encoding['pos'] + [0] * pad_len)
61
- input_ids.append(encoding['iids'] + [tokenizer.pad_token_id] * pad_len)
62
-
63
- encoding = {
64
- "input_ids": torch.tensor(input_ids).to(device),
65
- "attention_mask": torch.tensor(attention_masks).to(device),
66
- "position_ids": torch.tensor(position_ids).to(device)
67
- }
68
-
69
- return encoding, sents_tokens
70
-
71
- def run_token_predictor_sentences(sentences):
72
- encoding, at = sentences_to_input_tokens(sentences)
73
- predictions = model(**encoding)[0].cpu().tolist()
74
- outstrs = []
75
-
76
- for i in range(len(predictions)):
77
- outstr = ""
78
- for p in zip(tokenizer.convert_ids_to_tokens(at[i][1:-1]), predictions[i][1:-1]):
79
- if not "▁" in p[0]:
80
- outstr+=p[0]
81
- else:
82
- if p[1][0] > p[1][1]:
83
- outstr+=p[0].replace("▁", " ")
84
- else:
85
- outstr+=p[0].replace("▁", "")
86
- outstrs.append(outstr.strip())
87
- return outstrs
88
-
89
- outs = run_token_predictor_sentences(sentences)
90
- for p in zip(outs, sentences):
91
- print(p[1])
92
- print(p[0])
93
- print('\n------\n')
94
-
95
- ```
 
1
  ---
2
+ language: en
 
 
 
 
3
  ---