claudios committed
Commit 2274e74
1 Parent(s): e53a723

Add custom tokenizer

special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "pad_token": "<pad>"
+ }
tokenization_vulberta.py ADDED
@@ -0,0 +1,51 @@
+ from typing import List
+
+ from tokenizers import NormalizedString, PreTokenizedString
+ from tokenizers.pre_tokenizers import PreTokenizer
+ from transformers import PreTrainedTokenizerFast
+
+ try:
+     from clang import cindex
+ except ModuleNotFoundError as e:
+     raise ModuleNotFoundError(
+         "VulBERTa Clang tokenizer requires `libclang`. Please install it via `pip install libclang`.",
+     ) from e
+
+
+ class ClangPreTokenizer:
+     cidx = cindex.Index.create()
+
+     def clang_split(
+         self,
+         i: int,
+         normalized_string: NormalizedString,
+     ) -> List[NormalizedString]:
+         tok = []
+         tu = self.cidx.parse(
+             "tmp.c",
+             args=[""],
+             unsaved_files=[("tmp.c", str(normalized_string.original))],
+             options=0,
+         )
+         for t in tu.get_tokens(extent=tu.cursor.extent):
+             spelling = t.spelling.strip()
+             if spelling == "":
+                 continue
+             tok.append(NormalizedString(spelling))
+         return tok
+
+     def pre_tokenize(self, pretok: PreTokenizedString):
+         pretok.split(self.clang_split)
+
+
+ class VulBERTaTokenizer(PreTrainedTokenizerFast):
+     def __init__(
+         self,
+         *args,
+         **kwargs,
+     ):
+         super().__init__(
+             *args,
+             **kwargs,
+         )
+         self._tokenizer.pre_tokenizer = PreTokenizer.custom(ClangPreTokenizer())
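For reference, a minimal sketch of exercising the Clang pre-tokenizer on its own, assuming tokenization_vulberta.py is importable from the local directory and libclang is installed; the C snippet and variable names are illustrative, not part of this commit:

    from tokenizers.pre_tokenizers import PreTokenizer
    from tokenization_vulberta import ClangPreTokenizer

    # Wrap the Python class so the `tokenizers` library can call it as a pre-tokenizer.
    pre_tok = PreTokenizer.custom(ClangPreTokenizer())

    # Clang lexes the snippet into C tokens; whitespace-only spellings are dropped.
    # pre_tokenize_str returns a list of (piece, offsets) pairs; the pieces should be
    # C tokens such as "int", "main", "(", ")", "{", "return", "0", ";", "}".
    print(pre_tok.pre_tokenize_str("int main() { return 0; }"))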
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "added_tokens_decoder": {
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "max_length": 1024,
+   "model_max_length": 1024,
+   "pad_to_multiple_of": null,
+   "pad_token": "<pad>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "stride": 0,
+   "tokenizer_class": "VulBERTaTokenizer",
+   "auto_map": {
+     "AutoTokenizer": ["tokenization_vulberta.VulBERTaTokenizer", null]
+   },
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first"
+ }
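Because auto_map points AutoTokenizer at tokenization_vulberta.VulBERTaTokenizer, the tokenizer can be loaded with remote code enabled. A minimal sketch, where "<user>/<repo>" is a placeholder for whichever repository hosts these files, not an identifier from this commit:

    from transformers import AutoTokenizer

    # trust_remote_code=True lets transformers execute tokenization_vulberta.py
    # from the repository to build the custom VulBERTaTokenizer class.
    tok = AutoTokenizer.from_pretrained("<user>/<repo>", trust_remote_code=True)

    # Encode a C snippet; truncation uses the configured model_max_length of 1024.
    enc = tok("int main() { return 0; }", truncation=True)
    print(enc.input_ids)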