mozharovsky committed on
Commit
daf12c3
1 Parent(s): 17690fa

docs(readme): update to the latest version git_t5 0.2.3

Browse files
Files changed (1) hide show
  1. README.md +4 -64
README.md CHANGED
@@ -4,75 +4,13 @@ Pre-trained model on CodeSearchNet Python dataset using a span-masking objective
4
 
5
  # How to use
6
 
7
- You can use this model to denoise span-masked sequences. Note that you'll need to add some boilerplate code to add the noise to your sequences.
8
 
9
  First, install the [git-t5](https://github.com/formermagic/git-t5) pip package:
10
  ```shell
11
  > pip install git-t5
12
  ```
13
 
14
- Add the following code for encoding an input text:
15
- ```python
16
- from typing import Dict, Optional, Tuple
17
-
18
- import numpy as np
19
- import torch
20
- from transformers import PreTrainedTokenizerBase
21
-
22
- from git_t5.data import DataCollatorForT5MLM
23
-
24
-
25
- def encode(
26
- tokenizer: PreTrainedTokenizerBase,
27
- text: str,
28
- noise_density: float = 0.15,
29
- mean_noise_span_length: float = 3.0,
30
- extra_tokens_per_span_inputs: int = 1,
31
- extra_tokens_per_span_targets: int = 1,
32
- seed: Optional[int] = None,
33
- ) -> Tuple[Dict[str, torch.Tensor], int]:
34
- def compute_lengths(tokens_length: int) -> Tuple[int, int]:
35
- num_noise_tokens = int(round(tokens_length * noise_density))
36
- num_nonnoise_tokens = tokens_length - num_noise_tokens
37
- num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length))
38
- # inputs contain all nonnoise tokens, sentinels for all noise spans
39
- # and one EOS token.
40
- return (
41
- num_nonnoise_tokens + num_noise_spans * extra_tokens_per_span_inputs + 1,
42
- num_noise_tokens + num_noise_spans * extra_tokens_per_span_targets + 1,
43
- )
44
-
45
- encoding = tokenizer(
46
- text,
47
- truncation=False,
48
- return_attention_mask=False,
49
- return_length=True,
50
- )
51
-
52
- input_length = encoding.pop("length")
53
- input_length = input_length[0]
54
- input_length, target_length = compute_lengths(input_length)
55
-
56
- np.random.seed(seed)
57
-
58
- data_collator = DataCollatorForT5MLM(
59
- tokenizer=tokenizer,
60
- noise_density=noise_density,
61
- mean_noise_span_length=mean_noise_span_length,
62
- input_length=input_length,
63
- target_length=target_length,
64
- eos_token_id=tokenizer.eos_token_id,
65
- pad_token_id=tokenizer.pad_token_id,
66
- decoder_start_token_id=tokenizer.pad_token_id,
67
- sentinel_token_id=tokenizer.convert_tokens_to_ids("<extra_id_0>"),
68
- )
69
-
70
- batch = data_collator([encoding]) # type: ignore
71
- batch = {key: torch.tensor(val) for key, val in batch.items()}
72
-
73
- return batch, target_length
74
- ```
75
-
76
  Next, download the model and tokenizer:
77
  ```python
78
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
@@ -84,6 +22,8 @@ tokenizer = AutoTokenizer.from_pretrained("formermagic/pyt5-base")
84
 
85
  Finally, encode your input and generate the output sequence:
86
  ```python
 
 
87
  text = """
88
  def alias(self, annotationtype, set, fallback=False):
89
  if inspect.isclass(annotationtype): annotationtype = annotationtype.ANNOTATIONTYPE
@@ -95,7 +35,7 @@ def alias(self, annotationtype, set, fallback=False):
95
  raise KeyError("No alias for set " + set)
96
  """
97
 
98
- batch, max_length = encode(tokenizer, text, seed=22)
99
  outputs = model.generate(batch["input_ids"], max_length=max_length, num_beams=1)
100
  print(tokenizer.batch_decode(outputs[..., 1:]))
101
  print(tokenizer.batch_decode(batch["labels"]))
4
 
5
  # How to use
6
 
7
+ You can use this model to denoise span-masked sequences.
8
 
9
  First, install the [git-t5](https://github.com/formermagic/git-t5) pip package:
10
  ```shell
11
  > pip install git-t5
12
  ```
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  Next, download the model and tokenizer:
15
  ```python
16
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
22
 
23
  Finally, encode your input and generate the output sequence:
24
  ```python
25
+ from git_t5.utils import encode_input
26
+
27
  text = """
28
  def alias(self, annotationtype, set, fallback=False):
29
  if inspect.isclass(annotationtype): annotationtype = annotationtype.ANNOTATIONTYPE
35
  raise KeyError("No alias for set " + set)
36
  """
37
 
38
+ batch, max_length = encode_input(tokenizer, text, seed=22)
39
  outputs = model.generate(batch["input_ids"], max_length=max_length, num_beams=1)
40
  print(tokenizer.batch_decode(outputs[..., 1:]))
41
  print(tokenizer.batch_decode(batch["labels"]))