naleraphael committed on
Commit 876f9a6
1 Parent(s): 37ce1b9

Training in progress, step 500
.gitignore ADDED
@@ -0,0 +1 @@
+ checkpoint-*/
added_tokens.json ADDED
@@ -0,0 +1 @@
+ {"<s>": 3029, "</s>": 3030}
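For orientation: these two added tokens sit at the very top of the label space. vocab.json below holds the character vocabulary plus [UNK] and [PAD] (pad_token_id 3028 per config.json), and the two sentence-boundary tokens take ids 3029-3030, giving the vocab_size of 3031. A minimal sanity check, assuming a local clone of this repo (the "./" path is an assumption for illustration):

    from transformers import Wav2Vec2CTCTokenizer

    # Local clone assumed; a Hub repo id would work the same way.
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./")

    assert tokenizer.convert_tokens_to_ids("<s>") == 3029
    assert tokenizer.convert_tokens_to_ids("</s>") == 3030
    assert len(tokenizer) == 3031  # matches vocab_size in config.json below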
config.json ADDED
@@ -0,0 +1,107 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-xls-r-300m",
+   "activation_dropout": 0.1,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.0,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.0,
+   "mask_feature_length": 64,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.25,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.75,
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 3028,
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.17.0.dev0",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 3031,
+   "xvector_output_dim": 512
+ }
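As a quick way to cross-check the settings above against a local clone of this repo, a minimal sketch (the "./" path is an assumption; the commented values are what the file above declares):

    from transformers import AutoConfig, AutoModelForCTC

    config = AutoConfig.from_pretrained("./")
    print(config.model_type, config.vocab_size)             # wav2vec2 3031
    print(config.mask_time_prob, config.mask_feature_prob)  # 0.75 0.25 (SpecAugment)

    # Loads the weights from pytorch_model.bin below using this config.
    model = AutoModelForCTC.from_pretrained("./")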
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
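This feature extractor normalizes raw 16 kHz mono waveforms and, with return_attention_mask enabled, also emits an attention mask. A minimal sketch with dummy audio (local clone assumed):

    import numpy as np
    from transformers import Wav2Vec2FeatureExtractor

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./")

    # One second of dummy audio at the declared 16 kHz sampling rate.
    waveform = np.random.randn(16000).astype(np.float32)
    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
    print(inputs.input_values.shape, inputs.attention_mask.shape)  # [1, 16000] each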
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88b176f57552ec96e597aba50551da55604b80f1a0fd5bbc54f03565bb9239af
+ size 1274350833
run.sh ADDED
@@ -0,0 +1,35 @@
+ #!/bin/bash
+ python run_speech_recognition_ctc.py \
+     --dataset_name="mozilla-foundation/common_voice_7_0" \
+     --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
+     --dataset_config_name="zh-TW" \
+     --output_dir="./" \
+     --overwrite_output_dir \
+     --num_train_epochs="50" \
+     --per_device_train_batch_size="8" \
+     --per_device_eval_batch_size="8" \
+     --gradient_accumulation_steps="4" \
+     --learning_rate="7.5e-5" \
+     --warmup_steps="2000" \
+     --length_column_name="input_length" \
+     --evaluation_strategy="steps" \
+     --text_column_name="sentence" \
+     --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
+     --save_steps="500" \
+     --eval_steps="500" \
+     --logging_steps="100" \
+     --layerdrop="0.0" \
+     --activation_dropout="0.1" \
+     --save_total_limit="3" \
+     --freeze_feature_encoder \
+     --feat_proj_dropout="0.0" \
+     --mask_time_prob="0.75" \
+     --mask_time_length="10" \
+     --mask_feature_prob="0.25" \
+     --mask_feature_length="64" \
+     --gradient_checkpointing \
+     --use_auth_token \
+     --fp16 \
+     --group_by_length \
+     --do_train --do_eval \
+     --push_to_hub
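For reference, the effective train batch size this launch script runs with is per_device_train_batch_size × gradient_accumulation_steps = 8 × 4 = 32 per device, and since warmup_steps=2000, the learning rate is still warming up at the step-500 checkpoint this commit records (save_steps=500, eval_steps=500).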
run_speech_recognition_ctc.py ADDED
@@ -0,0 +1,737 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
+
+ import functools
+ import json
+ import logging
+ import os
+ import re
+ import sys
+ import warnings
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional, Union
+
+ import datasets
+ import numpy as np
+ import torch
+ from datasets import DatasetDict, load_dataset, load_metric
+
+ import transformers
+ from transformers import (
+     AutoConfig,
+     AutoFeatureExtractor,
+     AutoModelForCTC,
+     AutoProcessor,
+     AutoTokenizer,
+     HfArgumentParser,
+     Trainer,
+     TrainingArguments,
+     Wav2Vec2Processor,
+     set_seed,
+ )
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
+ from transformers.utils import check_min_version
+ from transformers.utils.versions import require_version
+
+
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+ check_min_version("4.17.0.dev0")
+
+ require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def list_field(default=None, metadata=None):
+     return field(default_factory=lambda: default, metadata=metadata)
+
+
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+     """
+
+     model_name_or_path: str = field(
+         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+     )
+     tokenizer_name_or_path: Optional[str] = field(
+         default=None,
+         metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
+     )
+     cache_dir: Optional[str] = field(
+         default=None,
+         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+     )
+     freeze_feature_encoder: bool = field(
+         default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
+     )
+     attention_dropout: float = field(
+         default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
+     )
+     activation_dropout: float = field(
+         default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
+     )
+     feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
+     hidden_dropout: float = field(
+         default=0.0,
+         metadata={
+             "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
+         },
+     )
+     final_dropout: float = field(
+         default=0.0,
+         metadata={"help": "The dropout probability for the final projection layer."},
+     )
+     mask_time_prob: float = field(
+         default=0.05,
+         metadata={
+             "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector "
+             "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
+             "vectors will be masked along the time axis."
+         },
+     )
+     mask_time_length: int = field(
+         default=10,
+         metadata={"help": "Length of vector span to mask along the time axis."},
+     )
+     mask_feature_prob: float = field(
+         default=0.0,
+         metadata={
+             "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector "
+             "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
+         },
+     )
+     mask_feature_length: int = field(
+         default=10,
+         metadata={"help": "Length of vector span to mask along the feature axis."},
+     )
+     layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
+     ctc_loss_reduction: Optional[str] = field(
+         default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
+     )
+
+
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+
+     Using `HfArgumentParser` we can turn this class
+     into argparse arguments to be able to specify them on
+     the command line.
+     """
+
+     dataset_name: str = field(
+         metadata={"help": "The name of the dataset to use (via the datasets library)."}
+     )
+     dataset_config_name: str = field(
+         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+     )
+     train_split_name: str = field(
+         default="train+validation",
+         metadata={
+             "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train+validation'"
+         },
+     )
+     eval_split_name: str = field(
+         default="test",
+         metadata={
+             "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+         },
+     )
+     audio_column_name: str = field(
+         default="audio",
+         metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
+     )
+     text_column_name: str = field(
+         default="text",
+         metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+     )
+     preprocessing_num_workers: Optional[int] = field(
+         default=None,
+         metadata={"help": "The number of processes to use for the preprocessing."},
+     )
+     max_train_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+             "value if set."
+         },
+     )
+     max_eval_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+             "value if set."
+         },
+     )
+     chars_to_ignore: Optional[List[str]] = list_field(
+         default=None,
+         metadata={"help": "A list of characters to remove from the transcripts."},
+     )
+     eval_metrics: List[str] = list_field(
+         default=["wer"],
+         metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
+     )
+     max_duration_in_seconds: float = field(
+         default=20.0,
+         metadata={
+             "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to `max_duration_in_seconds`"
+         },
+     )
+     min_duration_in_seconds: float = field(
+         default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+     )
+     preprocessing_only: bool = field(
+         default=False,
+         metadata={
+             "help": "Whether to only do data preprocessing and skip training. "
+             "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
+             "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
+             "so that the cached datasets can consequently be loaded in distributed training"
+         },
+     )
+     use_auth_token: bool = field(
+         default=False,
+         metadata={
+             "help": "If :obj:`True`, will use the token generated when running "
+             ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
+         },
+     )
+     unk_token: str = field(
+         default="[UNK]",
+         metadata={"help": "The unk token for the tokenizer"},
+     )
+     pad_token: str = field(
+         default="[PAD]",
+         metadata={"help": "The padding token for the tokenizer"},
+     )
+     word_delimiter_token: str = field(
+         default="|",
+         metadata={"help": "The word delimiter token for the tokenizer"},
+     )
+     phoneme_language: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The target language that should be"
+             " passed to the tokenizer for tokenization. Note that"
+             " this is only relevant if the model classifies the"
+             " input audio to a sequence of phoneme sequences."
+         },
+     )
+
+
+ @dataclass
+ class DataCollatorCTCWithPadding:
+     """
+     Data collator that will dynamically pad the inputs received.
+     Args:
+         processor (:class:`~transformers.AutoProcessor`)
+             The processor used for processing the data.
+         padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+             among:
+             * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+               sequence is provided).
+             * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+               maximum acceptable input length for the model if that argument is not provided.
+             * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+               different lengths).
+         max_length (:obj:`int`, `optional`):
+             Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+         max_length_labels (:obj:`int`, `optional`):
+             Maximum length of the ``labels`` returned list and optionally padding length (see above).
+         pad_to_multiple_of (:obj:`int`, `optional`):
+             If set will pad the sequence to a multiple of the provided value.
+             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+             7.5 (Volta).
+     """
+
+     processor: AutoProcessor
+     padding: Union[bool, str] = "longest"
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # split inputs and labels since they have to be of different lengths and need
+         # different padding methods
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+
+         with self.processor.as_target_processor():
+             labels_batch = self.processor.pad(
+                 label_features,
+                 padding=self.padding,
+                 pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                 return_tensors="pt",
+             )
+
+         # replace padding with -100 to ignore loss correctly
+         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+         batch["labels"] = labels
+
+         return batch
+
+
+ def create_vocabulary_from_data(
+     datasets: DatasetDict,
+     word_delimiter_token: Optional[str] = None,
+     unk_token: Optional[str] = None,
+     pad_token: Optional[str] = None,
+ ):
+     # Given training and test labels create vocabulary
+     def extract_all_chars(batch):
+         all_text = " ".join(batch["target_text"])
+         vocab = list(set(all_text))
+         return {"vocab": [vocab], "all_text": [all_text]}
+
+     vocabs = datasets.map(
+         extract_all_chars,
+         batched=True,
+         batch_size=-1,
+         keep_in_memory=True,
+         remove_columns=datasets["train"].column_names,
+     )
+
+     # take union of all unique characters in each dataset
+     vocab_set = functools.reduce(
+         lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
+     )
+
+     vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
+
+     # replace white space with delimiter token
+     if word_delimiter_token is not None:
+         vocab_dict[word_delimiter_token] = vocab_dict[" "]
+         del vocab_dict[" "]
+
+     # add unk and pad token
+     if unk_token is not None:
+         vocab_dict[unk_token] = len(vocab_dict)
+
+     if pad_token is not None:
+         vocab_dict[pad_token] = len(vocab_dict)
+
+     return vocab_dict
+
+
+ def main():
+     # See all possible arguments in src/transformers/training_args.py
+     # or by passing the --help flag to this script.
+     # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+         # If we pass only one argument to the script and it's the path to a json file,
+         # let's parse it to get our arguments.
+         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+     else:
+         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+     # Detecting last checkpoint.
+     last_checkpoint = None
+     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+         last_checkpoint = get_last_checkpoint(training_args.output_dir)
+         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+             raise ValueError(
+                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                 "Use --overwrite_output_dir to overcome."
+             )
+         elif last_checkpoint is not None:
+             logger.info(
+                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+             )
+
+     # Setup logging
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         handlers=[logging.StreamHandler(sys.stdout)],
+     )
+     logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+
+     # Log on each process the small summary:
+     logger.warning(
+         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+         f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+     )
+     # Set the verbosity to info of the Transformers logger (on main process only):
+     if is_main_process(training_args.local_rank):
+         transformers.utils.logging.set_verbosity_info()
+     logger.info("Training/evaluation parameters %s", training_args)
+
+     # Set seed before initializing model.
+     set_seed(training_args.seed)
+
+     # 1. First, let's load the dataset
+     raw_datasets = DatasetDict()
+
+     if training_args.do_train:
+         raw_datasets["train"] = load_dataset(
+             data_args.dataset_name,
+             data_args.dataset_config_name,
+             split=data_args.train_split_name,
+             use_auth_token=data_args.use_auth_token,
+         )
+
+         if data_args.audio_column_name not in raw_datasets["train"].column_names:
+             raise ValueError(
+                 f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
+                 "Make sure to set `--audio_column_name` to the correct audio column - one of "
+                 f"{', '.join(raw_datasets['train'].column_names)}."
+             )
+
+         if data_args.text_column_name not in raw_datasets["train"].column_names:
+             raise ValueError(
+                 f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
+                 "Make sure to set `--text_column_name` to the correct text column - one of "
+                 f"{', '.join(raw_datasets['train'].column_names)}."
+             )
+
+         if data_args.max_train_samples is not None:
+             raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+
+     if training_args.do_eval:
+         raw_datasets["eval"] = load_dataset(
+             data_args.dataset_name,
+             data_args.dataset_config_name,
+             split=data_args.eval_split_name,
+             use_auth_token=data_args.use_auth_token,
+         )
+
+         if data_args.max_eval_samples is not None:
+             raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
+
+     # 2. We remove some special characters from the datasets
+     # that make training complicated and do not help in transcribing the speech
+     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
+     # that could be easily picked up by the model
+     chars_to_ignore_regex = (
+         f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+     )
+     text_column_name = data_args.text_column_name
+
+     def remove_special_characters(batch):
+         if chars_to_ignore_regex is not None:
+             batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
+         else:
+             batch["target_text"] = batch[text_column_name].lower() + " "
+         return batch
+
+     with training_args.main_process_first(desc="dataset map special characters removal"):
+         raw_datasets = raw_datasets.map(
+             remove_special_characters,
+             remove_columns=[text_column_name],
+             desc="remove special characters from datasets",
+         )
+
+     # save special tokens for tokenizer
+     word_delimiter_token = data_args.word_delimiter_token
+     unk_token = data_args.unk_token
+     pad_token = data_args.pad_token
+
+     # 3. Next, let's load the config as we might need it to create
+     # the tokenizer
+     # load config
+     config = AutoConfig.from_pretrained(
+         model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+     )
+
+     # 4. Next, if no tokenizer file is defined,
+     # we create the vocabulary of the model by extracting all unique characters from
+     # the training and evaluation datasets
+     # We need to make sure that only first rank saves vocabulary
+     # make sure all processes wait until vocab is created
+     tokenizer_name_or_path = model_args.tokenizer_name_or_path
+     tokenizer_kwargs = {}
+     if tokenizer_name_or_path is None:
+         # save vocab in training output dir
+         tokenizer_name_or_path = training_args.output_dir
+
+         vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
+
+         with training_args.main_process_first():
+             if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
+                 os.remove(vocab_file)
+
+         with training_args.main_process_first(desc="dataset map vocabulary creation"):
+             if not os.path.isfile(vocab_file):
+                 os.makedirs(tokenizer_name_or_path, exist_ok=True)
+                 vocab_dict = create_vocabulary_from_data(
+                     raw_datasets,
+                     word_delimiter_token=word_delimiter_token,
+                     unk_token=unk_token,
+                     pad_token=pad_token,
+                 )
+
+                 # save vocab dict to be loaded into tokenizer
+                 with open(vocab_file, "w") as file:
+                     json.dump(vocab_dict, file)
+
+         # if tokenizer has just been created
+         # it is defined by `tokenizer_class` if present in config else by `model_type`
+         tokenizer_kwargs = {
+             "config": config if config.tokenizer_class is not None else None,
+             "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
+             "unk_token": unk_token,
+             "pad_token": pad_token,
+             "word_delimiter_token": word_delimiter_token,
+         }
+
+     # 5. Now we can instantiate the feature extractor, tokenizer and model
+     # Note for distributed training, the .from_pretrained methods guarantee that only
+     # one local process can concurrently download model & vocab.
+
+     # load feature_extractor and tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         tokenizer_name_or_path,
+         use_auth_token=data_args.use_auth_token,
+         **tokenizer_kwargs,
+     )
+     feature_extractor = AutoFeatureExtractor.from_pretrained(
+         model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+     )
+
+     # adapt config
+     config.update(
+         {
+             "feat_proj_dropout": model_args.feat_proj_dropout,
+             "attention_dropout": model_args.attention_dropout,
+             "hidden_dropout": model_args.hidden_dropout,
+             "final_dropout": model_args.final_dropout,
+             "mask_time_prob": model_args.mask_time_prob,
+             "mask_time_length": model_args.mask_time_length,
+             "mask_feature_prob": model_args.mask_feature_prob,
+             "mask_feature_length": model_args.mask_feature_length,
+             "gradient_checkpointing": training_args.gradient_checkpointing,
+             "layerdrop": model_args.layerdrop,
+             "ctc_loss_reduction": model_args.ctc_loss_reduction,
+             "pad_token_id": tokenizer.pad_token_id,
+             "vocab_size": len(tokenizer),
+             "activation_dropout": model_args.activation_dropout,
+         }
+     )
+
+     # create model
+     model = AutoModelForCTC.from_pretrained(
+         model_args.model_name_or_path,
+         cache_dir=model_args.cache_dir,
+         config=config,
+         use_auth_token=data_args.use_auth_token,
+     )
+
+     # freeze encoder
+     if model_args.freeze_feature_encoder:
+         model.freeze_feature_encoder()
+
+     # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
+     # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+     # so that we just need to set the correct target sampling rate and normalize the input
+     # via the `feature_extractor`
+
+     # make sure that dataset decodes audio with correct sampling rate
+     dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
+     if dataset_sampling_rate != feature_extractor.sampling_rate:
+         raw_datasets = raw_datasets.cast_column(
+             data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
+         )
+
+     # derive max & min input length for sample rate & max duration
+     max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
+     min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
+     audio_column_name = data_args.audio_column_name
+     num_workers = data_args.preprocessing_num_workers
+
+     # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
+     phoneme_language = data_args.phoneme_language
+
+     # Preprocessing the datasets.
+     # We need to read the audio files as arrays and tokenize the targets.
+     def prepare_dataset(batch):
+         # load audio
+         sample = batch[audio_column_name]
+
+         inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
+         batch["input_values"] = inputs.input_values[0]
+         batch["input_length"] = len(batch["input_values"])
+
+         # encode targets
+         additional_kwargs = {}
+         if phoneme_language is not None:
+             additional_kwargs["phonemizer_lang"] = phoneme_language
+
+         batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
+         return batch
+
+     with training_args.main_process_first(desc="dataset map preprocessing"):
+         vectorized_datasets = raw_datasets.map(
+             prepare_dataset,
+             remove_columns=next(iter(raw_datasets.values())).column_names,
+             num_proc=num_workers,
+             desc="preprocess datasets",
+         )
+
+     def is_audio_in_length_range(length):
+         return length > min_input_length and length < max_input_length
+
+     # filter data that is shorter than min_input_length
+     vectorized_datasets = vectorized_datasets.filter(
+         is_audio_in_length_range,
+         num_proc=num_workers,
+         input_columns=["input_length"],
+     )
+
+     # 7. Next, we can prepare the training.
+     # Let's use word error rate (WER) as our evaluation metric,
+     # instantiate a data collator and the trainer
+
+     # Define evaluation metrics during training, *i.e.* word error rate, character error rate
+     eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
+
+     # for large datasets it is advised to run the preprocessing on a
+     # single machine first with ``args.preprocessing_only`` since there will most likely
+     # be a timeout when running the script in distributed mode.
+     # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
+     # cached dataset
+     if data_args.preprocessing_only:
+         logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
+         return
+
+     def compute_metrics(pred):
+         pred_logits = pred.predictions
+         pred_ids = np.argmax(pred_logits, axis=-1)
+
+         pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
+
+         pred_str = tokenizer.batch_decode(pred_ids)
+         # we do not want to group tokens when computing the metrics
+         label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
+
+         metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
+
+         return metrics
+
+     # Now save everything to be able to create a single processor later
+     if is_main_process(training_args.local_rank):
+         # save feature extractor, tokenizer and config
+         feature_extractor.save_pretrained(training_args.output_dir)
+         tokenizer.save_pretrained(training_args.output_dir)
+         config.save_pretrained(training_args.output_dir)
+
+     try:
+         processor = AutoProcessor.from_pretrained(training_args.output_dir)
+     except (OSError, KeyError):
+         warnings.warn(
+             "Loading a processor from a feature extractor config that does not"
+             " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
+             " attribute to your `preprocessor_config.json` file to suppress this warning: "
+             " `'processor_class': 'Wav2Vec2Processor'`",
+             FutureWarning,
+         )
+         processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
+
+     # Instantiate custom data collator
+     data_collator = DataCollatorCTCWithPadding(processor=processor)
+
+     # Initialize Trainer
+     trainer = Trainer(
+         model=model,
+         data_collator=data_collator,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
+         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
+         tokenizer=feature_extractor,
+     )
+
+     # 8. Finally, we can start training
+
+     # Training
+     if training_args.do_train:
+
+         # use last checkpoint if it exists
+         if last_checkpoint is not None:
+             checkpoint = last_checkpoint
+         elif os.path.isdir(model_args.model_name_or_path):
+             checkpoint = model_args.model_name_or_path
+         else:
+             checkpoint = None
+
+         train_result = trainer.train(resume_from_checkpoint=checkpoint)
+         trainer.save_model()
+
+         metrics = train_result.metrics
+         max_train_samples = (
+             data_args.max_train_samples
+             if data_args.max_train_samples is not None
+             else len(vectorized_datasets["train"])
+         )
+         metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
+
+         trainer.log_metrics("train", metrics)
+         trainer.save_metrics("train", metrics)
+         trainer.save_state()
+
+     # Evaluation
+     results = {}
+     if training_args.do_eval:
+         logger.info("*** Evaluate ***")
+         metrics = trainer.evaluate()
+         max_eval_samples = (
+             data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
+         )
+         metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
+
+         trainer.log_metrics("eval", metrics)
+         trainer.save_metrics("eval", metrics)
+
+     # Write model card and (optionally) push to hub
+     config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
+     kwargs = {
+         "finetuned_from": model_args.model_name_or_path,
+         "tasks": "speech-recognition",
+         "tags": ["automatic-speech-recognition", data_args.dataset_name],
+         "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
+         "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
+     }
+     if "common_voice" in data_args.dataset_name:
+         kwargs["language"] = config_name
+
+     if training_args.push_to_hub:
+         trainer.push_to_hub(**kwargs)
+     else:
+         trainer.create_model_card(**kwargs)
+
+     return results
+
+
+ if __name__ == "__main__":
+     main()
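One detail of the script worth calling out is the label masking in DataCollatorCTCWithPadding.__call__: padded label positions are overwritten with -100, which Wav2Vec2ForCTC's loss treats as "ignore". A standalone sketch of just that step, with toy tensors:

    import torch

    # Toy padded label ids and attention mask, as returned by processor.pad(...).
    label_ids = torch.tensor([[5, 9, 2, 0], [7, 3, 0, 0]])
    attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

    # Same masking as in the collator: padding positions become -100.
    labels = label_ids.masked_fill(attention_mask.ne(1), -100)
    print(labels)
    # tensor([[   5,    9,    2, -100],
    #         [   7,    3, -100, -100]])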
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36a9163393fb6b85ab9754dc500a8171106095c789cc5d76758339e8671f5b54
+ size 2991
vocab.json ADDED
@@ -0,0 +1 @@
+ {"a": 1, "d": 2, "e": 3, "g": 4, "i": 5, "l": 6, "o": 7, "p": 8, "q": 9, "⋯": 10, "⼤": 11, "⽣": 12, "、": 13, "。": 14, "《": 15, "》": 16, "「": 17, "」": 18, "ㄟ": 19, "ㄧ": 20, "一": 21, "丁": 22, "七": 23, "丈": 24, "三": 25, "上": 26, "下": 27, "不": 28, "且": 29, "世": 30, "丘": 31, "丙": 32, "丟": 33, "並": 34, "中": 35, "串": 36, "丶": 37, "丸": 38, "丹": 39, "主": 40, "乃": 41, "久": 42, "之": 43, "乍": 44, "乎": 45, "乏": 46, "乖": 47, "乘": 48, "乙": 49, "九": 50, "乞": 51, "也": 52, "乳": 53, "乾": 54, "亂": 55, "了": 56, "予": 57, "事": 58, "二": 59, "互": 60, "五": 61, "井": 62, "些": 63, "亞": 64, "亡": 65, "交": 66, "亥": 67, "亦": 68, "享": 69, "京": 70, "亭": 71, "亮": 72, "人": 73, "什": 74, "仁": 75, "仆": 76, "仇": 77, "今": 78, "介": 79, "仍": 80, "仔": 81, "他": 82, "付": 83, "仙": 84, "代": 85, "令": 86, "以": 87, "仰": 88, "仲": 89, "件": 90, "任": 91, "份": 92, "仿": 93, "企": 94, "伊": 95, "伍": 96, "伏": 97, "伐": 98, "休": 99, "伙": 100, "伯": 101, "估": 102, "伴": 103, "伸": 104, "伺": 105, "似": 106, "佃": 107, "但": 108, "佈": 109, "位": 110, "低": 111, "住": 112, "佑": 113, "佔": 114, "何": 115, "佛": 116, "作": 117, "你": 118, "佩": 119, "佰": 120, "佳": 121, "併": 122, "使": 123, "來": 124, "侈": 125, "例": 126, "供": 127, "依": 128, "侮": 129, "侯": 130, "侵": 131, "侶": 132, "便": 133, "係": 134, "促": 135, "俄": 136, "俊": 137, "俏": 138, "俗": 139, "保": 140, "俠": 141, "信": 142, "修": 143, "俱": 144, "俾": 145, "倉": 146, "個": 147, "倍": 148, "們": 149, "倒": 150, "倖": 151, "倘": 152, "候": 153, "倚": 154, "借": 155, "倡": 156, "値": 157, "倦": 158, "倫": 159, "值": 160, "假": 161, "偉": 162, "偏": 163, "做": 164, "停": 165, "健": 166, "側": 167, "偵": 168, "偶": 169, "偷": 170, "偽": 171, "傅": 172, "傍": 173, "傑": 174, "傘": 175, "備": 176, "傢": 177, "催": 178, "傲": 179, "傳": 180, "債": 181, "傷": 182, "傻": 183, "傾": 184, "僅": 185, "像": 186, "僕": 187, "僚": 188, "僱": 189, "僵": 190, "價": 191, "僻": 192, "儀": 193, "儂": 194, "億": 195, "儘": 196, "償": 197, "優": 198, "儲": 199, "兀": 200, "允": 201, "元": 202, "兄": 203, "充": 204, "兇": 205, "先": 206, "光": 207, "克": 208, "兌": 209, "免": 210, "兒": 211, "兔": 212, "入": 213, "內": 214, "全": 215, "兩": 216, "八": 217, "公": 218, "六": 219, "兮": 220, "共": 221, "兵": 222, "其": 223, "具": 224, "典": 225, "兼": 226, "内": 227, "冊": 228, "再": 229, "冒": 230, "冗": 231, "冠": 232, "冤": 233, "冬": 234, "冰": 235, "冶": 236, "冷": 237, "准": 238, "凋": 239, "凌": 240, "凍": 241, "凝": 242, "凡": 243, "凱": 244, "凳": 245, "凹": 246, "出": 247, "函": 248, "刀": 249, "刁": 250, "分": 251, "切": 252, "刊": 253, "刑": 254, "划": 255, "列": 256, "初": 257, "判": 258, "別": 259, "利": 260, "刪": 261, "到": 262, "制": 263, "刷": 264, "券": 265, "刺": 266, "刻": 267, "則": 268, "前": 269, "剖": 270, "剛": 271, "剝": 272, "剩": 273, "剪": 274, "副": 275, "割": 276, "創": 277, "剷": 278, "劃": 279, "劇": 280, "劈": 281, "劉": 282, "劍": 283, "劑": 284, "力": 285, "功": 286, "加": 287, "劣": 288, "助": 289, "努": 290, "勁": 291, "勇": 292, "勉": 293, "勒": 294, "動": 295, "勘": 296, "務": 297, "勝": 298, "勞": 299, "募": 300, "勢": 301, "勤": 302, "勳": 303, "勵": 304, "勸": 305, "勻": 306, "勾": 307, "勿": 308, "包": 309, "匆": 310, "匈": 311, "化": 312, "北": 313, "匙": 314, "匪": 315, "匯": 316, "匱": 317, "匹": 318, "匿": 319, "區": 320, "十": 321, "千": 322, "升": 323, "午": 324, "半": 325, "卑": 326, "卓": 327, "協": 328, "南": 329, "博": 330, "占": 331, "卡": 332, "卦": 333, "印": 334, "危": 335, "即": 336, "卵": 337, "卷": 338, "卸": 339, "卻": 340, "厄": 341, "厚": 342, "原": 343, "厭": 344, "厲": 345, "去": 346, "參": 347, "又": 348, "叉": 349, "及": 350, "友": 351, "反": 352, "叔": 353, "取": 354, "受": 355, "叛": 356, "叢": 357, "口": 358, "古": 359, "句": 360, "另": 361, "叨": 362, "只": 363, "叫": 364, "召": 365, "叮": 366, 
"可": 367, "台": 368, "史": 369, "右": 370, "司": 371, "叼": 372, "吃": 373, "各": 374, "合": 375, "吉": 376, "吊": 377, "同": 378, "名": 379, "吐": 380, "向": 381, "君": 382, "吝": 383, "吞": 384, "吠": 385, "否": 386, "吧": 387, "含": 388, "吭": 389, "吳": 390, "吵": 391, "吸": 392, "吹": 393, "吻": 394, "吼": 395, "呀": 396, "呂": 397, "呆": 398, "呈": 399, "告": 400, "呢": 401, "周": 402, "呱": 403, "味": 404, "呵": 405, "呼": 406, "命": 407, "咆": 408, "和": 409, "咒": 410, "咕": 411, "咖": 412, "咚": 413, "咪": 414, "咬": 415, "咳": 416, "哀": 417, "品": 418, "哄": 419, "哇": 420, "哈": 421, "哉": 422, "哎": 423, "員": 424, "哥": 425, "哦": 426, "哨": 427, "哩": 428, "哪": 429, "哭": 430, "哲": 431, "哺": 432, "哼": 433, "唆": 434, "唐": 435, "唬": 436, "售": 437, "唯": 438, "唱": 439, "唷": 440, "唸": 441, "商": 442, "啊": 443, "問": 444, "啖": 445, "啟": 446, "啡": 447, "啤": 448, "啥": 449, "啦": 450, "啼": 451, "啾": 452, "喀": 453, "善": 454, "喉": 455, "喊": 456, "喔": 457, "喘": 458, "喜": 459, "喝": 460, "喧": 461, "喪": 462, "喬": 463, "單": 464, "喵": 465, "喻": 466, "嗆": 467, "嗎": 468, "嗚": 469, "嗜": 470, "嗡": 471, "嗨": 472, "嗯": 473, "嗽": 474, "嘆": 475, "嘉": 476, "嘍": 477, "嘗": 478, "嘛": 479, "嘯": 480, "嘰": 481, "嘲": 482, "嘴": 483, "嘻": 484, "噌": 485, "噓": 486, "噗": 487, "器": 488, "噬": 489, "噴": 490, "噶": 491, "噸": 492, "噹": 493, "嚀": 494, "嚇": 495, "嚕": 496, "嚨": 497, "嚮": 498, "嚴": 499, "嚼": 500, "囂": 501, "囉": 502, "囊": 503, "囚": 504, "四": 505, "回": 506, "因": 507, "囤": 508, "困": 509, "固": 510, "圈": 511, "國": 512, "圍": 513, "園": 514, "圓": 515, "圖": 516, "團": 517, "土": 518, "在": 519, "地": 520, "圾": 521, "址": 522, "均": 523, "坊": 524, "坎": 525, "坐": 526, "坑": 527, "坡": 528, "坦": 529, "坪": 530, "垂": 531, "垃": 532, "型": 533, "垢": 534, "埋": 535, "城": 536, "域": 537, "執": 538, "培": 539, "基": 540, "堂": 541, "堃": 542, "堅": 543, "堆": 544, "堡": 545, "堪": 546, "報": 547, "場": 548, "堵": 549, "塊": 550, "塑": 551, "塔": 552, "塗": 553, "塞": 554, "填": 555, "塭": 556, "塵": 557, "境": 558, "墓": 559, "增": 560, "墨": 561, "墮": 562, "墾": 563, "壁": 564, "壅": 565, "壇": 566, "壓": 567, "壘": 568, "壞": 569, "壟": 570, "壤": 571, "士": 572, "壯": 573, "壺": 574, "壽": 575, "夏": 576, "夕": 577, "外": 578, "多": 579, "夜": 580, "夠": 581, "夢": 582, "夥": 583, "大": 584, "天": 585, "太": 586, "夫": 587, "夭": 588, "央": 589, "失": 590, "夷": 591, "夾": 592, "奇": 593, "奈": 594, "奉": 595, "奏": 596, "契": 597, "奔": 598, "套": 599, "奠": 600, "奢": 601, "奧": 602, "奪": 603, "奮": 604, "女": 605, "奴": 606, "奶": 607, "奸": 608, "她": 609, "好": 610, "如": 611, "妃": 612, "妄": 613, "妊": 614, "妓": 615, "妖": 616, "妙": 617, "妝": 618, "妥": 619, "妨": 620, "妳": 621, "妹": 622, "妻": 623, "妾": 624, "姆": 625, "姊": 626, "始": 627, "姍": 628, "姐": 629, "姑": 630, "姓": 631, "委": 632, "姚": 633, "姦": 634, "姨": 635, "姬": 636, "姻": 637, "威": 638, "娑": 639, "娘": 640, "娛": 641, "娠": 642, "娥": 643, "娶": 644, "娼": 645, "婆": 646, "婉": 647, "婊": 648, "婚": 649, "婦": 650, "婪": 651, "媒": 652, "媳": 653, "媽": 654, "嫁": 655, "嫂": 656, "嫌": 657, "嫖": 658, "嫩": 659, "嬤": 660, "嬰": 661, "嬸": 662, "子": 663, "孔": 664, "孕": 665, "字": 666, "存": 667, "孝": 668, "季": 669, "孤": 670, "孩": 671, "孫": 672, "孵": 673, "學": 674, "它": 675, "宅": 676, "宇": 677, "守": 678, "安": 679, "宋": 680, "完": 681, "宗": 682, "官": 683, "宙": 684, "定": 685, "宛": 686, "宜": 687, "客": 688, "宣": 689, "室": 690, "宮": 691, "宰": 692, "害": 693, "宴": 694, "宵": 695, "家": 696, "容": 697, "宿": 698, "寂": 699, "寄": 700, "密": 701, "富": 702, "寒": 703, "寓": 704, "寞": 705, "察": 706, "寡": 707, "實": 708, "寧": 709, "寨": 710, "審": 711, "寫": 712, "寬": 713, "寮": 714, "寵": 715, "寶": 716, "寸": 717, "寺": 718, "封": 719, "射": 720, "將": 721, "專": 
722, "尊": 723, "尋": 724, "對": 725, "導": 726, "小": 727, "少": 728, "尖": 729, "尚": 730, "尤": 731, "尬": 732, "就": 733, "尷": 734, "尺": 735, "尼": 736, "尾": 737, "尿": 738, "局": 739, "屁": 740, "居": 741, "屆": 742, "屈": 743, "屋": 744, "屌": 745, "屍": 746, "屎": 747, "屏": 748, "屑": 749, "展": 750, "屜": 751, "屠": 752, "屢": 753, "層": 754, "履": 755, "屬": 756, "屯": 757, "山": 758, "岩": 759, "岸": 760, "峰": 761, "島": 762, "峻": 763, "峽": 764, "崇": 765, "崛": 766, "崩": 767, "崴": 768, "嵌": 769, "嵐": 770, "嶄": 771, "嶼": 772, "巔": 773, "川": 774, "州": 775, "巡": 776, "巢": 777, "工": 778, "左": 779, "巧": 780, "巨": 781, "差": 782, "己": 783, "已": 784, "巴": 785, "巷": 786, "巾": 787, "市": 788, "布": 789, "帆": 790, "希": 791, "帕": 792, "帖": 793, "帝": 794, "帥": 795, "師": 796, "席": 797, "帳": 798, "帶": 799, "常": 800, "帽": 801, "幅": 802, "幌": 803, "幕": 804, "幣": 805, "幫": 806, "干": 807, "平": 808, "年": 809, "幸": 810, "幹": 811, "幻": 812, "幼": 813, "幾": 814, "庄": 815, "床": 816, "序": 817, "底": 818, "店": 819, "府": 820, "度": 821, "座": 822, "庫": 823, "庭": 824, "康": 825, "庸": 826, "廁": 827, "廂": 828, "廉": 829, "廊": 830, "廖": 831, "廚": 832, "廟": 833, "廠": 834, "廢": 835, "廣": 836, "廬": 837, "廳": 838, "延": 839, "廷": 840, "建": 841, "弄": 842, "弊": 843, "式": 844, "弓": 845, "弔": 846, "引": 847, "弘": 848, "弟": 849, "弦": 850, "弱": 851, "張": 852, "強": 853, "彈": 854, "彊": 855, "彌": 856, "彎": 857, "彙": 858, "形": 859, "彩": 860, "彬": 861, "彭": 862, "影": 863, "彷": 864, "役": 865, "彼": 866, "彿": 867, "往": 868, "征": 869, "待": 870, "很": 871, "徊": 872, "律": 873, "後": 874, "徑": 875, "徒": 876, "得": 877, "徘": 878, "徙": 879, "從": 880, "復": 881, "循": 882, "徬": 883, "微": 884, "徵": 885, "德": 886, "徹": 887, "心": 888, "必": 889, "忌": 890, "忍": 891, "志": 892, "忘": 893, "忙": 894, "快": 895, "念": 896, "忽": 897, "怎": 898, "怒": 899, "怕": 900, "怖": 901, "思": 902, "怠": 903, "怡": 904, "急": 905, "怦": 906, "性": 907, "怨": 908, "怪": 909, "怵": 910, "恃": 911, "恆": 912, "恍": 913, "恐": 914, "恕": 915, "恢": 916, "恤": 917, "恨": 918, "恩": 919, "恭": 920, "息": 921, "恰": 922, "悄": 923, "悅": 924, "悉": 925, "悔": 926, "悟": 927, "悠": 928, "患": 929, "您": 930, "悲": 931, "情": 932, "惇": 933, "惋": 934, "惑": 935, "惕": 936, "惘": 937, "惜": 938, "惠": 939, "惡": 940, "惰": 941, "惱": 942, "想": 943, "惶": 944, "惹": 945, "愁": 946, "愈": 947, "愉": 948, "意": 949, "愕": 950, "愚": 951, "愛": 952, "愜": 953, "感": 954, "愣": 955, "愧": 956, "慈": 957, "態": 958, "慌": 959, "慎": 960, "慕": 961, "慘": 962, "慚": 963, "慢": 964, "慣": 965, "慧": 966, "慮": 967, "慰": 968, "慶": 969, "慾": 970, "憂": 971, "憊": 972, "憐": 973, "憑": 974, "憤": 975, "憩": 976, "憲": 977, "憶": 978, "憾": 979, "懂": 980, "懇": 981, "應": 982, "懲": 983, "懶": 984, "懷": 985, "懸": 986, "懼": 987, "戀": 988, "戊": 989, "成": 990, "我": 991, "戒": 992, "或": 993, "戚": 994, "截": 995, "戰": 996, "戲": 997, "戴": 998, "戶": 999, "房": 1000, "所": 1001, "扇": 1002, "扉": 1003, "手": 1004, "才": 1005, "扎": 1006, "打": 1007, "托": 1008, "扛": 1009, "扣": 1010, "扭": 1011, "扮": 1012, "扯": 1013, "扶": 1014, "批": 1015, "扼": 1016, "找": 1017, "承": 1018, "技": 1019, "抄": 1020, "把": 1021, "抑": 1022, "抒": 1023, "抓": 1024, "投": 1025, "抖": 1026, "抗": 1027, "折": 1028, "披": 1029, "抬": 1030, "抱": 1031, "抵": 1032, "抹": 1033, "抽": 1034, "拆": 1035, "拉": 1036, "拋": 1037, "拍": 1038, "拒": 1039, "拓": 1040, "拔": 1041, "拖": 1042, "拘": 1043, "拚": 1044, "招": 1045, "拜": 1046, "括": 1047, "拯": 1048, "拱": 1049, "拳": 1050, "拼": 1051, "拾": 1052, "拿": 1053, "持": 1054, "指": 1055, "按": 1056, "挑": 1057, "挖": 1058, "挫": 1059, "振": 1060, "挺": 1061, "捍": 1062, "捐": 1063, "捕": 1064, "捨": 1065, "捲": 1066, "捷": 1067, "掃": 1068, "授": 1069, "掉": 
1070, "掌": 1071, "掏": 1072, "排": 1073, "掘": 1074, "掙": 1075, "掛": 1076, "掠": 1077, "採": 1078, "探": 1079, "接": 1080, "控": 1081, "推": 1082, "掩": 1083, "措": 1084, "掰": 1085, "揀": 1086, "揉": 1087, "描": 1088, "提": 1089, "插": 1090, "揚": 1091, "換": 1092, "握": 1093, "揣": 1094, "揪": 1095, "揭": 1096, "揮": 1097, "援": 1098, "揹": 1099, "損": 1100, "搏": 1101, "搔": 1102, "搖": 1103, "搗": 1104, "搜": 1105, "搞": 1106, "搬": 1107, "搭": 1108, "搶": 1109, "摔": 1110, "摘": 1111, "摟": 1112, "摧": 1113, "摩": 1114, "摯": 1115, "摸": 1116, "撇": 1117, "撐": 1118, "撒": 1119, "撕": 1120, "撞": 1121, "撤": 1122, "撥": 1123, "撫": 1124, "播": 1125, "撰": 1126, "撲": 1127, "撿": 1128, "擁": 1129, "擅": 1130, "擇": 1131, "擊": 1132, "擋": 1133, "操": 1134, "擎": 1135, "擔": 1136, "據": 1137, "擠": 1138, "擦": 1139, "擬": 1140, "擱": 1141, "擲": 1142, "擴": 1143, "擷": 1144, "擺": 1145, "擾": 1146, "攀": 1147, "攔": 1148, "攜": 1149, "攝": 1150, "攤": 1151, "攪": 1152, "支": 1153, "收": 1154, "攸": 1155, "改": 1156, "攻": 1157, "放": 1158, "政": 1159, "故": 1160, "效": 1161, "敏": 1162, "救": 1163, "敗": 1164, "敘": 1165, "教": 1166, "敝": 1167, "敢": 1168, "散": 1169, "敦": 1170, "敬": 1171, "敲": 1172, "整": 1173, "敵": 1174, "數": 1175, "斂": 1176, "文": 1177, "斌": 1178, "斑": 1179, "斗": 1180, "料": 1181, "斜": 1182, "斤": 1183, "斥": 1184, "斧": 1185, "斯": 1186, "新": 1187, "斷": 1188, "方": 1189, "於": 1190, "施": 1191, "旁": 1192, "旅": 1193, "旋": 1194, "族": 1195, "旗": 1196, "既": 1197, "日": 1198, "旦": 1199, "旨": 1200, "早": 1201, "旬": 1202, "旭": 1203, "旺": 1204, "昂": 1205, "昆": 1206, "昇": 1207, "昌": 1208, "明": 1209, "昏": 1210, "易": 1211, "昔": 1212, "星": 1213, "映": 1214, "春": 1215, "昧": 1216, "昨": 1217, "是": 1218, "昴": 1219, "時": 1220, "晃": 1221, "晉": 1222, "晚": 1223, "晨": 1224, "普": 1225, "景": 1226, "晰": 1227, "晴": 1228, "晶": 1229, "智": 1230, "晾": 1231, "暄": 1232, "暇": 1233, "暈": 1234, "暑": 1235, "暖": 1236, "暗": 1237, "暫": 1238, "暱": 1239, "暴": 1240, "曆": 1241, "曉": 1242, "曙": 1243, "曝": 1244, "曬": 1245, "曲": 1246, "更": 1247, "書": 1248, "曹": 1249, "曼": 1250, "曾": 1251, "替": 1252, "最": 1253, "會": 1254, "月": 1255, "有": 1256, "朋": 1257, "服": 1258, "朕": 1259, "朗": 1260, "望": 1261, "朝": 1262, "期": 1263, "木": 1264, "未": 1265, "末": 1266, "本": 1267, "札": 1268, "朱": 1269, "朵": 1270, "朽": 1271, "杉": 1272, "李": 1273, "材": 1274, "村": 1275, "杖": 1276, "杜": 1277, "束": 1278, "杭": 1279, "杯": 1280, "杰": 1281, "東": 1282, "松": 1283, "板": 1284, "枉": 1285, "析": 1286, "枕": 1287, "林": 1288, "枚": 1289, "果": 1290, "枝": 1291, "枯": 1292, "架": 1293, "柏": 1294, "某": 1295, "染": 1296, "柔": 1297, "柚": 1298, "查": 1299, "柯": 1300, "柱": 1301, "柳": 1302, "柴": 1303, "柵": 1304, "柺": 1305, "柿": 1306, "栗": 1307, "校": 1308, "栩": 1309, "株": 1310, "核": 1311, "根": 1312, "格": 1313, "栽": 1314, "桂": 1315, "桃": 1316, "框": 1317, "案": 1318, "桌": 1319, "桶": 1320, "桿": 1321, "梁": 1322, "梅": 1323, "梗": 1324, "條": 1325, "梨": 1326, "梯": 1327, "梳": 1328, "棄": 1329, "棉": 1330, "棋": 1331, "棍": 1332, "棒": 1333, "棚": 1334, "棟": 1335, "森": 1336, "棲": 1337, "棵": 1338, "棺": 1339, "椅": 1340, "植": 1341, "椎": 1342, "椒": 1343, "椰": 1344, "楊": 1345, "楚": 1346, "楞": 1347, "業": 1348, "極": 1349, "概": 1350, "榔": 1351, "榕": 1352, "榜": 1353, "榨": 1354, "榮": 1355, "榻": 1356, "構": 1357, "槍": 1358, "槓": 1359, "樁": 1360, "樂": 1361, "樑": 1362, "樓": 1363, "標": 1364, "樞": 1365, "模": 1366, "樣": 1367, "樵": 1368, "樸": 1369, "樹": 1370, "樺": 1371, "樽": 1372, "橋": 1373, "橘": 1374, "橙": 1375, "機": 1376, "橡": 1377, "橫": 1378, "檔": 1379, "檢": 1380, "檬": 1381, "檯": 1382, "檳": 1383, "檸": 1384, "檻": 1385, "櫃": 1386, "櫻": 1387, "欄": 1388, "欉": 1389, "權": 1390, "欠": 1391, "次": 1392, "欣": 
1393, "欲": 1394, "欸": 1395, "欺": 1396, "欽": 1397, "款": 1398, "歉": 1399, "歌": 1400, "歐": 1401, "歡": 1402, "止": 1403, "正": 1404, "此": 1405, "步": 1406, "武": 1407, "歧": 1408, "歲": 1409, "歷": 1410, "歸": 1411, "死": 1412, "殊": 1413, "殖": 1414, "殘": 1415, "殭": 1416, "段": 1417, "殷": 1418, "殺": 1419, "殼": 1420, "殿": 1421, "毀": 1422, "毋": 1423, "母": 1424, "每": 1425, "毒": 1426, "比": 1427, "毛": 1428, "毫": 1429, "氏": 1430, "民": 1431, "氘": 1432, "氚": 1433, "氛": 1434, "氣": 1435, "氧": 1436, "氨": 1437, "氫": 1438, "氮": 1439, "氰": 1440, "水": 1441, "永": 1442, "氾": 1443, "汀": 1444, "汁": 1445, "求": 1446, "汐": 1447, "汗": 1448, "汙": 1449, "汛": 1450, "汝": 1451, "江": 1452, "池": 1453, "污": 1454, "汪": 1455, "汰": 1456, "決": 1457, "汽": 1458, "汾": 1459, "沃": 1460, "沈": 1461, "沉": 1462, "沐": 1463, "沒": 1464, "沖": 1465, "沙": 1466, "沛": 1467, "没": 1468, "沮": 1469, "沱": 1470, "河": 1471, "沸": 1472, "油": 1473, "治": 1474, "沼": 1475, "沾": 1476, "沿": 1477, "況": 1478, "泉": 1479, "泊": 1480, "法": 1481, "泛": 1482, "泡": 1483, "波": 1484, "泣": 1485, "泥": 1486, "注": 1487, "泰": 1488, "泳": 1489, "洋": 1490, "洗": 1491, "洛": 1492, "洞": 1493, "津": 1494, "洪": 1495, "洱": 1496, "洲": 1497, "活": 1498, "洽": 1499, "派": 1500, "流": 1501, "浩": 1502, "浪": 1503, "浮": 1504, "浴": 1505, "海": 1506, "消": 1507, "涉": 1508, "涯": 1509, "液": 1510, "涵": 1511, "涸": 1512, "涼": 1513, "淇": 1514, "淋": 1515, "淑": 1516, "淒": 1517, "淘": 1518, "淚": 1519, "淡": 1520, "淤": 1521, "淨": 1522, "淪": 1523, "深": 1524, "淵": 1525, "混": 1526, "淹": 1527, "淺": 1528, "添": 1529, "清": 1530, "減": 1531, "渡": 1532, "渣": 1533, "渥": 1534, "渦": 1535, "測": 1536, "渭": 1537, "港": 1538, "渲": 1539, "渴": 1540, "游": 1541, "渾": 1542, "湊": 1543, "湍": 1544, "湖": 1545, "湧": 1546, "湯": 1547, "溉": 1548, "源": 1549, "準": 1550, "溝": 1551, "溪": 1552, "溫": 1553, "溶": 1554, "溺": 1555, "溼": 1556, "滂": 1557, "滅": 1558, "滋": 1559, "滌": 1560, "滑": 1561, "滯": 1562, "滲": 1563, "滴": 1564, "滷": 1565, "滾": 1566, "滿": 1567, "漁": 1568, "漂": 1569, "漆": 1570, "漏": 1571, "演": 1572, "漠": 1573, "漢": 1574, "漫": 1575, "漲": 1576, "漸": 1577, "潑": 1578, "潔": 1579, "潛": 1580, "潤": 1581, "潦": 1582, "潮": 1583, "潰": 1584, "澄": 1585, "澎": 1586, "澡": 1587, "澤": 1588, "澳": 1589, "激": 1590, "濃": 1591, "濕": 1592, "濟": 1593, "濫": 1594, "濱": 1595, "濾": 1596, "瀏": 1597, "瀑": 1598, "瀕": 1599, "灌": 1600, "灑": 1601, "灘": 1602, "灣": 1603, "火": 1604, "灰": 1605, "災": 1606, "炎": 1607, "炒": 1608, "炫": 1609, "炮": 1610, "炸": 1611, "為": 1612, "烈": 1613, "烊": 1614, "烏": 1615, "烤": 1616, "烹": 1617, "焉": 1618, "焚": 1619, "無": 1620, "焦": 1621, "焰": 1622, "然": 1623, "煎": 1624, "煙": 1625, "煤": 1626, "煦": 1627, "照": 1628, "煩": 1629, "煮": 1630, "熊": 1631, "熔": 1632, "熟": 1633, "熬": 1634, "熱": 1635, "燃": 1636, "燈": 1637, "燒": 1638, "燕": 1639, "燙": 1640, "營": 1641, "燥": 1642, "燦": 1643, "燭": 1644, "爆": 1645, "爍": 1646, "爐": 1647, "爛": 1648, "爪": 1649, "爬": 1650, "爭": 1651, "爲": 1652, "父": 1653, "爸": 1654, "爹": 1655, "爺": 1656, "爽": 1657, "爾": 1658, "牆": 1659, "片": 1660, "版": 1661, "牌": 1662, "牙": 1663, "牛": 1664, "牠": 1665, "牡": 1666, "牧": 1667, "物": 1668, "牲": 1669, "牴": 1670, "特": 1671, "牽": 1672, "犬": 1673, "犯": 1674, "狀": 1675, "狂": 1676, "狐": 1677, "狗": 1678, "狠": 1679, "狸": 1680, "狹": 1681, "狼": 1682, "猛": 1683, "猜": 1684, "猩": 1685, "猴": 1686, "猶": 1687, "猿": 1688, "獄": 1689, "獅": 1690, "獎": 1691, "獨": 1692, "獲": 1693, "獵": 1694, "獸": 1695, "獻": 1696, "玄": 1697, "率": 1698, "玉": 1699, "王": 1700, "玩": 1701, "玫": 1702, "玻": 1703, "珀": 1704, "珈": 1705, "珍": 1706, "珠": 1707, "班": 1708, "現": 1709, "球": 1710, "理": 1711, "琥": 1712, "琪": 1713, "琴": 1714, "瑋": 1715, "瑕": 
1716, "瑜": 1717, "瑟": 1718, "瑩": 1719, "瑪": 1720, "瑰": 1721, "璃": 1722, "璧": 1723, "環": 1724, "瓜": 1725, "瓦": 1726, "瓶": 1727, "瓷": 1728, "甄": 1729, "甘": 1730, "甚": 1731, "甜": 1732, "生": 1733, "產": 1734, "甦": 1735, "用": 1736, "甩": 1737, "甫": 1738, "田": 1739, "由": 1740, "甲": 1741, "申": 1742, "男": 1743, "町": 1744, "界": 1745, "畏": 1746, "畔": 1747, "留": 1748, "畜": 1749, "畢": 1750, "略": 1751, "番": 1752, "畫": 1753, "異": 1754, "當": 1755, "畸": 1756, "疆": 1757, "疊": 1758, "疏": 1759, "疑": 1760, "疙": 1761, "疫": 1762, "疲": 1763, "疼": 1764, "疾": 1765, "病": 1766, "症": 1767, "痕": 1768, "痛": 1769, "痠": 1770, "痰": 1771, "痴": 1772, "瘋": 1773, "瘟": 1774, "瘦": 1775, "瘩": 1776, "療": 1777, "癌": 1778, "癒": 1779, "癡": 1780, "癢": 1781, "癮": 1782, "癱": 1783, "登": 1784, "發": 1785, "白": 1786, "百": 1787, "皂": 1788, "的": 1789, "皆": 1790, "皇": 1791, "皮": 1792, "皺": 1793, "盃": 1794, "盆": 1795, "益": 1796, "盎": 1797, "盒": 1798, "盔": 1799, "盛": 1800, "盜": 1801, "盞": 1802, "盟": 1803, "盡": 1804, "監": 1805, "盤": 1806, "盪": 1807, "目": 1808, "盲": 1809, "直": 1810, "相": 1811, "盼": 1812, "盾": 1813, "省": 1814, "眉": 1815, "看": 1816, "真": 1817, "眠": 1818, "眨": 1819, "眷": 1820, "眺": 1821, "眼": 1822, "眾": 1823, "睛": 1824, "睞": 1825, "睡": 1826, "督": 1827, "睦": 1828, "瞌": 1829, "瞧": 1830, "瞪": 1831, "瞬": 1832, "瞭": 1833, "瞻": 1834, "矛": 1835, "知": 1836, "矩": 1837, "短": 1838, "矮": 1839, "矲": 1840, "石": 1841, "砂": 1842, "研": 1843, "砲": 1844, "破": 1845, "砷": 1846, "硃": 1847, "硫": 1848, "硬": 1849, "碌": 1850, "碎": 1851, "碑": 1852, "碗": 1853, "碟": 1854, "碧": 1855, "碩": 1856, "碰": 1857, "碳": 1858, "確": 1859, "碼": 1860, "碾": 1861, "磁": 1862, "磐": 1863, "磚": 1864, "磨": 1865, "礁": 1866, "礎": 1867, "礙": 1868, "示": 1869, "社": 1870, "祈": 1871, "祉": 1872, "祕": 1873, "祖": 1874, "祝": 1875, "神": 1876, "祟": 1877, "祥": 1878, "票": 1879, "祭": 1880, "祿": 1881, "禁": 1882, "禍": 1883, "福": 1884, "禦": 1885, "禧": 1886, "禮": 1887, "禱": 1888, "禿": 1889, "秀": 1890, "私": 1891, "秉": 1892, "秋": 1893, "科": 1894, "秒": 1895, "秘": 1896, "租": 1897, "秦": 1898, "秧": 1899, "秩": 1900, "移": 1901, "稀": 1902, "稅": 1903, "程": 1904, "稍": 1905, "稚": 1906, "稜": 1907, "稠": 1908, "種": 1909, "稱": 1910, "稻": 1911, "稽": 1912, "稿": 1913, "穀": 1914, "積": 1915, "穗": 1916, "穩": 1917, "穴": 1918, "究": 1919, "空": 1920, "穿": 1921, "突": 1922, "窄": 1923, "窗": 1924, "窘": 1925, "窮": 1926, "窯": 1927, "竄": 1928, "竅": 1929, "竇": 1930, "立": 1931, "站": 1932, "竟": 1933, "章": 1934, "童": 1935, "竭": 1936, "端": 1937, "競": 1938, "竹": 1939, "竿": 1940, "笑": 1941, "符": 1942, "笨": 1943, "第": 1944, "筆": 1945, "等": 1946, "筋": 1947, "筍": 1948, "筐": 1949, "筒": 1950, "答": 1951, "策": 1952, "箕": 1953, "算": 1954, "管": 1955, "箭": 1956, "箱": 1957, "節": 1958, "範": 1959, "篇": 1960, "築": 1961, "篡": 1962, "篩": 1963, "篷": 1964, "簡": 1965, "簷": 1966, "簽": 1967, "簿": 1968, "籃": 1969, "籌": 1970, "籍": 1971, "籠": 1972, "籤": 1973, "籮": 1974, "籲": 1975, "米": 1976, "籽": 1977, "粉": 1978, "粒": 1979, "粗": 1980, "粥": 1981, "粵": 1982, "粹": 1983, "粽": 1984, "精": 1985, "粿": 1986, "糊": 1987, "糕": 1988, "糖": 1989, "糞": 1990, "糟": 1991, "糧": 1992, "糬": 1993, "糰": 1994, "系": 1995, "糾": 1996, "紀": 1997, "約": 1998, "紅": 1999, "紋": 2000, "納": 2001, "紐": 2002, "紓": 2003, "純": 2004, "紗": 2005, "紙": 2006, "級": 2007, "紛": 2008, "素": 2009, "紡": 2010, "索": 2011, "紫": 2012, "累": 2013, "細": 2014, "紹": 2015, "終": 2016, "組": 2017, "絆": 2018, "結": 2019, "絕": 2020, "絡": 2021, "給": 2022, "絨": 2023, "統": 2024, "絲": 2025, "綁": 2026, "經": 2027, "綜": 2028, "綠": 2029, "綢": 2030, "維": 2031, "綱": 2032, "網": 2033, "綴": 2034, "綿": 2035, "緊": 2036, "緒": 2037, "線": 2038, "締": 
2039, "緣": 2040, "編": 2041, "緩": 2042, "緯": 2043, "練": 2044, "緻": 2045, "縈": 2046, "縝": 2047, "縣": 2048, "縫": 2049, "縮": 2050, "縱": 2051, "總": 2052, "績": 2053, "繁": 2054, "繆": 2055, "織": 2056, "繞": 2057, "繩": 2058, "繪": 2059, "繫": 2060, "繳": 2061, "繹": 2062, "繼": 2063, "續": 2064, "纏": 2065, "纜": 2066, "缸": 2067, "缺": 2068, "罄": 2069, "罐": 2070, "罕": 2071, "罩": 2072, "罪": 2073, "置": 2074, "罰": 2075, "署": 2076, "罵": 2077, "罷": 2078, "罹": 2079, "羅": 2080, "羈": 2081, "羊": 2082, "美": 2083, "羞": 2084, "群": 2085, "羨": 2086, "義": 2087, "羽": 2088, "翁": 2089, "翅": 2090, "習": 2091, "翹": 2092, "翻": 2093, "翼": 2094, "耀": 2095, "老": 2096, "考": 2097, "者": 2098, "而": 2099, "耍": 2100, "耐": 2101, "耕": 2102, "耗": 2103, "耘": 2104, "耳": 2105, "耶": 2106, "耽": 2107, "聆": 2108, "聊": 2109, "聖": 2110, "聘": 2111, "聚": 2112, "聞": 2113, "聯": 2114, "聰": 2115, "聲": 2116, "聳": 2117, "職": 2118, "聽": 2119, "肅": 2120, "肉": 2121, "肋": 2122, "肌": 2123, "肖": 2124, "肘": 2125, "肚": 2126, "肝": 2127, "股": 2128, "肢": 2129, "肥": 2130, "肩": 2131, "肯": 2132, "育": 2133, "肺": 2134, "肽": 2135, "胃": 2136, "背": 2137, "胎": 2138, "胖": 2139, "胚": 2140, "胜": 2141, "胞": 2142, "胡": 2143, "胥": 2144, "胯": 2145, "胸": 2146, "能": 2147, "脂": 2148, "脅": 2149, "脆": 2150, "脈": 2151, "脊": 2152, "脖": 2153, "脫": 2154, "脹": 2155, "脾": 2156, "腐": 2157, "腔": 2158, "腕": 2159, "腦": 2160, "腫": 2161, "腮": 2162, "腰": 2163, "腱": 2164, "腳": 2165, "腸": 2166, "腹": 2167, "腿": 2168, "膀": 2169, "膏": 2170, "膚": 2171, "膠": 2172, "膨": 2173, "膩": 2174, "膽": 2175, "臂": 2176, "臃": 2177, "臉": 2178, "臘": 2179, "臟": 2180, "臣": 2181, "臥": 2182, "臨": 2183, "自": 2184, "臭": 2185, "至": 2186, "致": 2187, "臺": 2188, "臼": 2189, "舅": 2190, "與": 2191, "興": 2192, "舉": 2193, "舊": 2194, "舌": 2195, "舍": 2196, "舒": 2197, "舔": 2198, "舞": 2199, "舟": 2200, "航": 2201, "般": 2202, "舵": 2203, "船": 2204, "舺": 2205, "艇": 2206, "艋": 2207, "艘": 2208, "艦": 2209, "良": 2210, "艱": 2211, "色": 2212, "艷": 2213, "艾": 2214, "芋": 2215, "芒": 2216, "芙": 2217, "芝": 2218, "芥": 2219, "芬": 2220, "芭": 2221, "花": 2222, "芳": 2223, "芹": 2224, "芽": 2225, "苓": 2226, "苗": 2227, "苟": 2228, "若": 2229, "苦": 2230, "苪": 2231, "英": 2232, "茂": 2233, "范": 2234, "茄": 2235, "茅": 2236, "茫": 2237, "茱": 2238, "茲": 2239, "茶": 2240, "草": 2241, "荒": 2242, "荔": 2243, "荷": 2244, "莊": 2245, "莎": 2246, "莓": 2247, "莫": 2248, "莽": 2249, "菁": 2250, "菅": 2251, "菇": 2252, "菊": 2253, "菌": 2254, "菜": 2255, "華": 2256, "菲": 2257, "菸": 2258, "萄": 2259, "萊": 2260, "萌": 2261, "萬": 2262, "萱": 2263, "落": 2264, "葉": 2265, "著": 2266, "葛": 2267, "葡": 2268, "董": 2269, "葩": 2270, "葬": 2271, "蒂": 2272, "蒐": 2273, "蒙": 2274, "蒜": 2275, "蒨": 2276, "蒸": 2277, "蒼": 2278, "蓉": 2279, "蓋": 2280, "蓮": 2281, "蔔": 2282, "蔗": 2283, "蔚": 2284, "蔡": 2285, "蔣": 2286, "蔥": 2287, "蔭": 2288, "蕉": 2289, "蕩": 2290, "蕾": 2291, "薄": 2292, "薏": 2293, "薑": 2294, "薛": 2295, "薦": 2296, "薩": 2297, "薪": 2298, "薯": 2299, "藉": 2300, "藍": 2301, "藏": 2302, "藝": 2303, "藤": 2304, "藥": 2305, "藩": 2306, "蘆": 2307, "蘇": 2308, "蘊": 2309, "蘋": 2310, "蘑": 2311, "蘭": 2312, "蘿": 2313, "虎": 2314, "虐": 2315, "處": 2316, "虛": 2317, "虞": 2318, "號": 2319, "虧": 2320, "蚊": 2321, "蚤": 2322, "蚵": 2323, "蛇": 2324, "蛋": 2325, "蛙": 2326, "蛛": 2327, "蛤": 2328, "蛻": 2329, "蜂": 2330, "蜘": 2331, "蜜": 2332, "蝦": 2333, "蝴": 2334, "蝶": 2335, "蝸": 2336, "螂": 2337, "螃": 2338, "融": 2339, "螞": 2340, "螢": 2341, "螺": 2342, "蟑": 2343, "蟬": 2344, "蟲": 2345, "蟹": 2346, "蟻": 2347, "蠅": 2348, "蠟": 2349, "蠢": 2350, "蠣": 2351, "蠱": 2352, "蠻": 2353, "血": 2354, "衆": 2355, "行": 2356, "衍": 2357, "術": 2358, "街": 2359, "衛": 2360, "衝": 2361, "衡": 
2362, "衣": 2363, "表": 2364, "衫": 2365, "衰": 2366, "袁": 2367, "袋": 2368, "袍": 2369, "袖": 2370, "被": 2371, "袱": 2372, "裁": 2373, "裂": 2374, "裎": 2375, "裏": 2376, "裕": 2377, "裙": 2378, "補": 2379, "裝": 2380, "裡": 2381, "裸": 2382, "製": 2383, "複": 2384, "褐": 2385, "褲": 2386, "襄": 2387, "襪": 2388, "襲": 2389, "西": 2390, "要": 2391, "覆": 2392, "見": 2393, "規": 2394, "覓": 2395, "視": 2396, "親": 2397, "覺": 2398, "覽": 2399, "觀": 2400, "角": 2401, "解": 2402, "觸": 2403, "言": 2404, "訂": 2405, "計": 2406, "訊": 2407, "討": 2408, "訐": 2409, "訓": 2410, "訕": 2411, "託": 2412, "記": 2413, "訝": 2414, "訟": 2415, "訣": 2416, "訪": 2417, "設": 2418, "許": 2419, "訴": 2420, "診": 2421, "註": 2422, "証": 2423, "詐": 2424, "評": 2425, "詛": 2426, "詞": 2427, "詡": 2428, "詢": 2429, "試": 2430, "詩": 2431, "詬": 2432, "詭": 2433, "話": 2434, "該": 2435, "詳": 2436, "詹": 2437, "誇": 2438, "誌": 2439, "認": 2440, "誓": 2441, "誕": 2442, "誘": 2443, "語": 2444, "誠": 2445, "誡": 2446, "誣": 2447, "誤": 2448, "說": 2449, "誰": 2450, "課": 2451, "誼": 2452, "調": 2453, "談": 2454, "請": 2455, "諒": 2456, "論": 2457, "諜": 2458, "諧": 2459, "諮": 2460, "諸": 2461, "諾": 2462, "謀": 2463, "謂": 2464, "謄": 2465, "謊": 2466, "謎": 2467, "講": 2468, "謝": 2469, "謠": 2470, "謹": 2471, "證": 2472, "識": 2473, "譚": 2474, "譜": 2475, "警": 2476, "譯": 2477, "議": 2478, "護": 2479, "譽": 2480, "讀": 2481, "變": 2482, "讓": 2483, "讚": 2484, "谷": 2485, "豆": 2486, "豈": 2487, "豐": 2488, "豚": 2489, "象": 2490, "豪": 2491, "豫": 2492, "豬": 2493, "貂": 2494, "貌": 2495, "貓": 2496, "貝": 2497, "貞": 2498, "負": 2499, "財": 2500, "貢": 2501, "貧": 2502, "貨": 2503, "販": 2504, "貪": 2505, "貫": 2506, "責": 2507, "貯": 2508, "貲": 2509, "貴": 2510, "買": 2511, "貸": 2512, "費": 2513, "貼": 2514, "貿": 2515, "賀": 2516, "賄": 2517, "賅": 2518, "資": 2519, "賈": 2520, "賊": 2521, "賓": 2522, "賜": 2523, "賞": 2524, "賠": 2525, "賢": 2526, "賣": 2527, "賤": 2528, "賦": 2529, "質": 2530, "賭": 2531, "賴": 2532, "賺": 2533, "購": 2534, "賽": 2535, "贈": 2536, "贊": 2537, "贏": 2538, "赤": 2539, "赫": 2540, "走": 2541, "赴": 2542, "起": 2543, "趁": 2544, "超": 2545, "越": 2546, "趕": 2547, "趙": 2548, "趟": 2549, "趣": 2550, "趨": 2551, "足": 2552, "趴": 2553, "跆": 2554, "跋": 2555, "跌": 2556, "跑": 2557, "跚": 2558, "距": 2559, "跟": 2560, "跡": 2561, "跨": 2562, "路": 2563, "跳": 2564, "踉": 2565, "踏": 2566, "踐": 2567, "踢": 2568, "踩": 2569, "踴": 2570, "踹": 2571, "蹄": 2572, "蹈": 2573, "蹌": 2574, "蹟": 2575, "蹣": 2576, "蹤": 2577, "蹦": 2578, "蹲": 2579, "蹺": 2580, "躁": 2581, "躍": 2582, "身": 2583, "躲": 2584, "躺": 2585, "軀": 2586, "車": 2587, "軌": 2588, "軍": 2589, "軟": 2590, "軸": 2591, "軾": 2592, "較": 2593, "載": 2594, "輒": 2595, "輔": 2596, "輕": 2597, "輝": 2598, "輩": 2599, "輪": 2600, "輯": 2601, "輸": 2602, "輻": 2603, "輾": 2604, "輿": 2605, "轄": 2606, "轉": 2607, "轍": 2608, "轎": 2609, "辛": 2610, "辜": 2611, "辟": 2612, "辣": 2613, "辦": 2614, "辨": 2615, "辭": 2616, "辯": 2617, "辱": 2618, "農": 2619, "迅": 2620, "迎": 2621, "近": 2622, "返": 2623, "迪": 2624, "迫": 2625, "述": 2626, "迴": 2627, "迷": 2628, "追": 2629, "退": 2630, "送": 2631, "逃": 2632, "逆": 2633, "透": 2634, "逐": 2635, "途": 2636, "逕": 2637, "逗": 2638, "這": 2639, "通": 2640, "逛": 2641, "逝": 2642, "逞": 2643, "速": 2644, "造": 2645, "逢": 2646, "連": 2647, "逮": 2648, "週": 2649, "進": 2650, "逼": 2651, "逾": 2652, "遁": 2653, "遇": 2654, "遊": 2655, "運": 2656, "遍": 2657, "過": 2658, "道": 2659, "達": 2660, "違": 2661, "遙": 2662, "遜": 2663, "遞": 2664, "遠": 2665, "適": 2666, "遭": 2667, "遮": 2668, "遲": 2669, "遴": 2670, "遵": 2671, "遷": 2672, "選": 2673, "遺": 2674, "遼": 2675, "遽": 2676, "避": 2677, "邀": 2678, "邁": 2679, "還": 2680, "邊": 2681, "邏": 2682, "那": 2683, "邦": 2684, "邪": 
2685, "邱": 2686, "邵": 2687, "郁": 2688, "郊": 2689, "郎": 2690, "郝": 2691, "部": 2692, "郭": 2693, "郵": 2694, "都": 2695, "鄉": 2696, "鄧": 2697, "鄭": 2698, "鄰": 2699, "酋": 2700, "配": 2701, "酒": 2702, "酗": 2703, "酪": 2704, "酬": 2705, "酮": 2706, "酵": 2707, "酶": 2708, "酷": 2709, "酸": 2710, "醉": 2711, "醋": 2712, "醒": 2713, "醜": 2714, "醫": 2715, "醬": 2716, "釁": 2717, "采": 2718, "釋": 2719, "里": 2720, "重": 2721, "野": 2722, "量": 2723, "釐": 2724, "金": 2725, "釘": 2726, "針": 2727, "釣": 2728, "鈉": 2729, "鈔": 2730, "鈕": 2731, "鈣": 2732, "鈴": 2733, "鉅": 2734, "鉛": 2735, "鉤": 2736, "銀": 2737, "銘": 2738, "銜": 2739, "銷": 2740, "鋒": 2741, "鋪": 2742, "鋸": 2743, "鋼": 2744, "錄": 2745, "錐": 2746, "錢": 2747, "錦": 2748, "錨": 2749, "錫": 2750, "錯": 2751, "錶": 2752, "鍊": 2753, "鍋": 2754, "鍛": 2755, "鍵": 2756, "鍾": 2757, "鎂": 2758, "鎖": 2759, "鎮": 2760, "鏈": 2761, "鏡": 2762, "鐘": 2763, "鐵": 2764, "鑑": 2765, "鑰": 2766, "鑼": 2767, "長": 2768, "門": 2769, "閃": 2770, "閉": 2771, "開": 2772, "閒": 2773, "間": 2774, "閱": 2775, "闆": 2776, "闊": 2777, "闖": 2778, "關": 2779, "闢": 2780, "阱": 2781, "防": 2782, "阻": 2783, "阿": 2784, "陀": 2785, "附": 2786, "陌": 2787, "降": 2788, "限": 2789, "陡": 2790, "院": 2791, "陣": 2792, "除": 2793, "陪": 2794, "陰": 2795, "陳": 2796, "陵": 2797, "陶": 2798, "陷": 2799, "陸": 2800, "陽": 2801, "隆": 2802, "隊": 2803, "階": 2804, "隔": 2805, "際": 2806, "障": 2807, "隧": 2808, "隨": 2809, "險": 2810, "隱": 2811, "隻": 2812, "雀": 2813, "雄": 2814, "雅": 2815, "集": 2816, "雇": 2817, "雌": 2818, "雕": 2819, "雖": 2820, "雙": 2821, "雛": 2822, "雜": 2823, "雞": 2824, "離": 2825, "難": 2826, "雨": 2827, "雪": 2828, "雲": 2829, "零": 2830, "雷": 2831, "電": 2832, "需": 2833, "霄": 2834, "震": 2835, "霉": 2836, "霍": 2837, "霖": 2838, "霧": 2839, "露": 2840, "霸": 2841, "霹": 2842, "��": 2843, "靈": 2844, "青": 2845, "靜": 2846, "非": 2847, "靠": 2848, "靡": 2849, "面": 2850, "革": 2851, "靭": 2852, "靴": 2853, "鞋": 2854, "鞏": 2855, "鞭": 2856, "韋": 2857, "韓": 2858, "音": 2859, "韻": 2860, "響": 2861, "頁": 2862, "頂": 2863, "頃": 2864, "項": 2865, "順": 2866, "須": 2867, "預": 2868, "頓": 2869, "頗": 2870, "領": 2871, "頭": 2872, "頰": 2873, "頸": 2874, "頻": 2875, "顆": 2876, "題": 2877, "額": 2878, "顏": 2879, "願": 2880, "類": 2881, "顧": 2882, "顯": 2883, "顰": 2884, "風": 2885, "颱": 2886, "飄": 2887, "飆": 2888, "飛": 2889, "食": 2890, "飩": 2891, "飪": 2892, "飯": 2893, "飲": 2894, "飽": 2895, "飾": 2896, "餃": 2897, "餅": 2898, "養": 2899, "餐": 2900, "餓": 2901, "餘": 2902, "餚": 2903, "餛": 2904, "餡": 2905, "館": 2906, "餮": 2907, "餵": 2908, "饋": 2909, "饒": 2910, "饕": 2911, "饗": 2912, "首": 2913, "香": 2914, "馥": 2915, "馨": 2916, "馬": 2917, "馭": 2918, "馳": 2919, "馴": 2920, "駁": 2921, "駐": 2922, "駕": 2923, "駛": 2924, "駭": 2925, "騎": 2926, "騙": 2927, "騰": 2928, "騷": 2929, "騾": 2930, "驅": 2931, "驕": 2932, "驗": 2933, "驚": 2934, "驟": 2935, "骨": 2936, "骼": 2937, "髒": 2938, "髓": 2939, "體": 2940, "高": 2941, "髮": 2942, "鬆": 2943, "鬍": 2944, "鬚": 2945, "鬥": 2946, "鬧": 2947, "鬱": 2948, "鬼": 2949, "魂": 2950, "魄": 2951, "魅": 2952, "魍": 2953, "魎": 2954, "魏": 2955, "魑": 2956, "魔": 2957, "魚": 2958, "魯": 2959, "魷": 2960, "鮪": 2961, "鮭": 2962, "鮮": 2963, "鯊": 2964, "鯛": 2965, "鯨": 2966, "鰈": 2967, "鰜": 2968, "鰭": 2969, "鰻": 2970, "鱔": 2971, "鱷": 2972, "鳥": 2973, "鳩": 2974, "鳳": 2975, "鳴": 2976, "鴉": 2977, "鴨": 2978, "鴻": 2979, "鵝": 2980, "鵲": 2981, "鶯": 2982, "鶴": 2983, "鷹": 2984, "鹹": 2985, "鹽": 2986, "鹿": 2987, "麗": 2988, "麥": 2989, "麵": 2990, "麻": 2991, "麼": 2992, "麽": 2993, "黃": 2994, "黎": 2995, "黑": 2996, "默": 2997, "點": 2998, "黨": 2999, "黯": 3000, "鼎": 3001, "鼓": 3002, "鼠": 3003, "鼻": 3004, "齁": 3005, "齊": 3006, "齋": 3007, "齒": 
3008, "齡": 3009, "龍": 3010, "龐": 3011, "龜": 3012, "!": 3013, ",": 3014, ":": 3015, ";": 3016, "?": 3017, "a": 3018, "b": 3019, "f": 3020, "g": 3021, "i": 3022, "n": 3023, "p": 3024, "t": 3025, "~": 3026, "|": 0, "[UNK]": 3027, "[PAD]": 3028}