RafatK committed on
Commit
d3ca54f
·
verified ·
1 Parent(s): e9ef96c

Upload processor

Browse files
Files changed (3) hide show
  1. processor_config.json +17 -0
  2. tokenizer.json +113 -0
  3. tokenizer_config.json +14 -0
processor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor": {
3
+ "chunk_length": 30,
4
+ "dither": 0.0,
5
+ "feature_extractor_type": "WhisperFeatureExtractor",
6
+ "feature_size": 80,
7
+ "hop_length": 160,
8
+ "n_fft": 400,
9
+ "n_samples": 480000,
10
+ "nb_max_frames": 3000,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ },
16
+ "processor_class": "WhisperProcessor"
17
+ }
tokenizer.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|endoftext|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ }
15
+ ],
16
+ "normalizer": null,
17
+ "pre_tokenizer": {
18
+ "type": "ByteLevel",
19
+ "add_prefix_space": false,
20
+ "trim_offsets": true,
21
+ "use_regex": true
22
+ },
23
+ "post_processor": {
24
+ "type": "TemplateProcessing",
25
+ "single": [
26
+ {
27
+ "SpecialToken": {
28
+ "id": "<|endoftext|>",
29
+ "type_id": 0
30
+ }
31
+ },
32
+ {
33
+ "SpecialToken": {
34
+ "id": "<|endoftext|>",
35
+ "type_id": 0
36
+ }
37
+ },
38
+ {
39
+ "Sequence": {
40
+ "id": "A",
41
+ "type_id": 0
42
+ }
43
+ },
44
+ {
45
+ "SpecialToken": {
46
+ "id": "<|endoftext|>",
47
+ "type_id": 0
48
+ }
49
+ }
50
+ ],
51
+ "pair": [
52
+ {
53
+ "SpecialToken": {
54
+ "id": "<|endoftext|>",
55
+ "type_id": 0
56
+ }
57
+ },
58
+ {
59
+ "SpecialToken": {
60
+ "id": "<|endoftext|>",
61
+ "type_id": 0
62
+ }
63
+ },
64
+ {
65
+ "Sequence": {
66
+ "id": "A",
67
+ "type_id": 0
68
+ }
69
+ },
70
+ {
71
+ "Sequence": {
72
+ "id": "B",
73
+ "type_id": 1
74
+ }
75
+ },
76
+ {
77
+ "SpecialToken": {
78
+ "id": "<|endoftext|>",
79
+ "type_id": 1
80
+ }
81
+ }
82
+ ],
83
+ "special_tokens": {
84
+ "<|endoftext|>": {
85
+ "id": "<|endoftext|>",
86
+ "ids": [
87
+ 0
88
+ ],
89
+ "tokens": [
90
+ "<|endoftext|>"
91
+ ]
92
+ }
93
+ }
94
+ },
95
+ "decoder": {
96
+ "type": "ByteLevel",
97
+ "add_prefix_space": true,
98
+ "trim_offsets": true,
99
+ "use_regex": true
100
+ },
101
+ "model": {
102
+ "type": "BPE",
103
+ "dropout": null,
104
+ "unk_token": null,
105
+ "continuing_subword_prefix": "",
106
+ "end_of_word_suffix": "",
107
+ "fuse_unk": false,
108
+ "byte_fallback": false,
109
+ "ignore_merges": false,
110
+ "vocab": {},
111
+ "merges": []
112
+ }
113
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "eos_token": "<|endoftext|>",
6
+ "is_local": true,
7
+ "language": null,
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "predict_timestamps": false,
10
+ "processor_class": "WhisperProcessor",
11
+ "task": null,
12
+ "tokenizer_class": "WhisperTokenizer",
13
+ "unk_token": "<|endoftext|>"
14
+ }