PereLluis13
committed on
Commit
·
6ef7fcf
1
Parent(s):
fc69d43
First model version
Browse files- .gitignore +1 -0
- added_tokens.json +1 -0
- alphabet.json +1 -0
- config.json +107 -0
- language_model/5gram.bin +3 -0
- language_model/attrs.json +1 -0
- language_model/unigrams.txt +0 -0
- preprocessor_config.json +9 -0
- pytorch_model.bin +3 -0
- run.sh +39 -0
- run_speech_recognition_ctc.py +814 -0
- special_tokens_map.json +1 -0
- text/LICENSE +19 -0
- text/__init__.py +74 -0
- text/ca.sor +485 -0
- text/cleaners.py +150 -0
- text/cmudict.py +65 -0
- text/numbers.py +71 -0
- text/numbers_ca.py +54 -0
- text/numbers_ca_test.py +106 -0
- text/soros.py +140 -0
- text/symbols.py +17 -0
- text/symbols_en.py +18 -0
- tokenizer_config.json +1 -0
- training_args.bin +3 -0
- vocab.json +1 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
checkpoint-*/
|
added_tokens.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"<s>": 44, "</s>": 45}
|
alphabet.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"labels": [" ", "#", "'", "-", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00b7", "\u00e0", "\u00e7", "\u00e8", "\u00e9", "\u00ed", "\u00ef", "\u00f2", "\u00f3", "\u00fa", "\u00fc", "\u0903", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
|
config.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "facebook/wav2vec2-xls-r-300m",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"adapter_kernel_size": 3,
|
5 |
+
"adapter_stride": 2,
|
6 |
+
"add_adapter": false,
|
7 |
+
"apply_spec_augment": true,
|
8 |
+
"architectures": [
|
9 |
+
"Wav2Vec2ForCTC"
|
10 |
+
],
|
11 |
+
"attention_dropout": 0.0,
|
12 |
+
"bos_token_id": 1,
|
13 |
+
"classifier_proj_size": 256,
|
14 |
+
"codevector_dim": 768,
|
15 |
+
"contrastive_logits_temperature": 0.1,
|
16 |
+
"conv_bias": true,
|
17 |
+
"conv_dim": [
|
18 |
+
512,
|
19 |
+
512,
|
20 |
+
512,
|
21 |
+
512,
|
22 |
+
512,
|
23 |
+
512,
|
24 |
+
512
|
25 |
+
],
|
26 |
+
"conv_kernel": [
|
27 |
+
10,
|
28 |
+
3,
|
29 |
+
3,
|
30 |
+
3,
|
31 |
+
3,
|
32 |
+
2,
|
33 |
+
2
|
34 |
+
],
|
35 |
+
"conv_stride": [
|
36 |
+
5,
|
37 |
+
2,
|
38 |
+
2,
|
39 |
+
2,
|
40 |
+
2,
|
41 |
+
2,
|
42 |
+
2
|
43 |
+
],
|
44 |
+
"ctc_loss_reduction": "mean",
|
45 |
+
"ctc_zero_infinity": false,
|
46 |
+
"diversity_loss_weight": 0.1,
|
47 |
+
"do_stable_layer_norm": true,
|
48 |
+
"eos_token_id": 2,
|
49 |
+
"feat_extract_activation": "gelu",
|
50 |
+
"feat_extract_dropout": 0.0,
|
51 |
+
"feat_extract_norm": "layer",
|
52 |
+
"feat_proj_dropout": 0.0,
|
53 |
+
"feat_quantizer_dropout": 0.0,
|
54 |
+
"final_dropout": 0.0,
|
55 |
+
"hidden_act": "gelu",
|
56 |
+
"hidden_dropout": 0.0,
|
57 |
+
"hidden_size": 1024,
|
58 |
+
"initializer_range": 0.02,
|
59 |
+
"intermediate_size": 4096,
|
60 |
+
"layer_norm_eps": 1e-05,
|
61 |
+
"layerdrop": 0.0,
|
62 |
+
"mask_feature_length": 64,
|
63 |
+
"mask_feature_min_masks": 0,
|
64 |
+
"mask_feature_prob": 0.25,
|
65 |
+
"mask_time_length": 10,
|
66 |
+
"mask_time_min_masks": 2,
|
67 |
+
"mask_time_prob": 0.75,
|
68 |
+
"model_type": "wav2vec2",
|
69 |
+
"num_adapter_layers": 3,
|
70 |
+
"num_attention_heads": 16,
|
71 |
+
"num_codevector_groups": 2,
|
72 |
+
"num_codevectors_per_group": 320,
|
73 |
+
"num_conv_pos_embedding_groups": 16,
|
74 |
+
"num_conv_pos_embeddings": 128,
|
75 |
+
"num_feat_extract_layers": 7,
|
76 |
+
"num_hidden_layers": 24,
|
77 |
+
"num_negatives": 100,
|
78 |
+
"output_hidden_size": 1024,
|
79 |
+
"pad_token_id": 43,
|
80 |
+
"proj_codevector_dim": 768,
|
81 |
+
"tdnn_dilation": [
|
82 |
+
1,
|
83 |
+
2,
|
84 |
+
3,
|
85 |
+
1,
|
86 |
+
1
|
87 |
+
],
|
88 |
+
"tdnn_dim": [
|
89 |
+
512,
|
90 |
+
512,
|
91 |
+
512,
|
92 |
+
512,
|
93 |
+
1500
|
94 |
+
],
|
95 |
+
"tdnn_kernel": [
|
96 |
+
5,
|
97 |
+
3,
|
98 |
+
3,
|
99 |
+
1,
|
100 |
+
1
|
101 |
+
],
|
102 |
+
"torch_dtype": "float32",
|
103 |
+
"transformers_version": "4.16.0.dev0",
|
104 |
+
"use_weighted_layer_sum": false,
|
105 |
+
"vocab_size": 46,
|
106 |
+
"xvector_output_dim": 512
|
107 |
+
}
|
language_model/5gram.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af7f2cbb75fbcb5091a6a24c9b6984b81a0c257652ca96f7b434f5b6e5224740
|
3 |
+
size 630272442
|
language_model/attrs.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
|
language_model/unigrams.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8266cc58fff449061f92483f6f7afebb0e00e0f6bba42881c5d847151579245c
|
3 |
+
size 1262112241
|
run.sh
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Fine-tune facebook/wav2vec2-xls-r-300m with a CTC head on three Catalan
# corpora. The multi-valued options (--dataset_name, --dataset_config_name,
# --train_split_name, --eval_split_name, --audio_column_name,
# --text_column_name) are parallel lists: the i-th value of each option
# describes the i-th dataset.
#
# --chars_to_ignore takes the bare characters only. The training script wraps
# the joined characters in a regex character class itself, so also passing
# literal "[" and "]" here would build "[[...]]", which only matches a listed
# character that is immediately followed by "]". Every character is quoted so
# the shell never glob-expands "?" or re-interprets ";" and the double quote.
python run_speech_recognition_ctc.py \
	--dataset_name "mozilla-foundation/common_voice_8_0" "collectivat/tv3_parla" "projecte-aina/parlament_parla" \
	--dataset_config_name "ca" "ca" "clean" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--train_split_name "train+validation" "train" "train+validation" \
	--eval_split_name "test" "test" "test" \
	--audio_column_name "audio" "audio" "audio" \
	--output_dir="wav2vec2-xls-r-300m-ca" \
	--overwrite_output_dir \
	--num_train_epochs="10" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="32" \
	--gradient_accumulation_steps="4" \
	--learning_rate="7.5e-5" \
	--warmup_steps="2000" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name "sentence" "text" "sentence" \
	--chars_to_ignore "," "?" "." "!" ";" ":" '"' "“" "%" "”" "�" "—" "…" "–" \
	--save_steps="500" \
	--eval_steps="500" \
	--logging_steps="500" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="3" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--preprocessing_num_workers="12" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
	--push_to_hub
# Optional redirection, left disabled: append it to the command above to
# capture stdout and stderr in a log file.
#&> train.log
|
run_speech_recognition_ctc.py
ADDED
@@ -0,0 +1,814 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding=utf-8
|
3 |
+
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
# limitations under the License.
|
15 |
+
|
16 |
+
""" Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
|
17 |
+
|
18 |
+
import functools
|
19 |
+
import json
|
20 |
+
import logging
|
21 |
+
import os
|
22 |
+
import re
|
23 |
+
import sys
|
24 |
+
import warnings
|
25 |
+
from dataclasses import dataclass, field
|
26 |
+
from typing import Dict, List, Optional, Union
|
27 |
+
from text.numbers_ca import normalize_numbers_ca
|
28 |
+
|
29 |
+
import datasets
|
30 |
+
import numpy as np
|
31 |
+
import torch
|
32 |
+
from datasets import DatasetDict, load_dataset, load_metric, concatenate_datasets
|
33 |
+
|
34 |
+
import transformers
|
35 |
+
from transformers import (
|
36 |
+
AutoConfig,
|
37 |
+
AutoFeatureExtractor,
|
38 |
+
AutoModelForCTC,
|
39 |
+
AutoProcessor,
|
40 |
+
AutoTokenizer,
|
41 |
+
HfArgumentParser,
|
42 |
+
Trainer,
|
43 |
+
TrainingArguments,
|
44 |
+
Wav2Vec2Processor,
|
45 |
+
set_seed,
|
46 |
+
)
|
47 |
+
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
48 |
+
from transformers.utils import check_min_version
|
49 |
+
from transformers.utils.versions import require_version
|
50 |
+
|
51 |
+
|
52 |
+
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.16.0.dev0")

# The install hint names the speech-recognition example requirements, which are the ones this script depends on.
require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")


# Module-level logger; its level is configured per process inside main().
logger = logging.getLogger(__name__)
|
59 |
+
|
60 |
+
|
61 |
+
def list_field(default=None, metadata=None):
    """Build a dataclass field whose default value is a list (or ``None``).

    Dataclasses reject a mutable default passed directly, so the default is
    supplied through a factory. The factory hands out a fresh shallow copy on
    every call: returning ``default`` itself would make every instance share
    one list object, so mutating the value on one instance would silently
    change the default seen by every other instance.

    Args:
        default: The default list, or ``None`` when the option has no value.
        metadata: Mapping stored on the resulting field's ``metadata``.

    Returns:
        The ``dataclasses.field`` object to assign in the class body.
    """

    def _fresh_default():
        return None if default is None else list(default)

    return field(default_factory=_fresh_default, metadata=metadata)
|
63 |
+
|
64 |
+
|
65 |
+
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Every attribute is a dataclass field whose ``help`` metadata is the text shown for the
    matching command-line option. Only ``model_name_or_path`` has no default and is
    therefore required.
    """

    # --- checkpoint / tokenizer location ---
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )

    # --- dropout settings ---
    attention_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
    )
    feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
    hidden_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout probability for the final projection layer."},
    )

    # --- masking settings ---
    # The help texts below are built from adjacent string literals; each fragment ends with a
    # space so that the words do not run together in the rendered help.
    mask_time_prob: float = field(
        default=0.05,
        metadata={
            "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector "
            "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
            "vectors will be masked along the time axis."
        },
    )
    mask_time_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the time axis."},
    )
    mask_feature_prob: float = field(
        default=0.0,
        metadata={
            "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector "
            "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
        },
    )
    mask_feature_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the feature axis."},
    )

    # --- remaining model settings ---
    layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
    ctc_loss_reduction: Optional[str] = field(
        default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
    )
|
129 |
+
|
130 |
+
|
131 |
+
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.

    `dataset_name`, `dataset_config_name`, `train_split_name`, `eval_split_name`,
    `audio_column_name` and `text_column_name` are parallel lists: `main()` zips them
    together, so the i-th entry of each list describes the i-th dataset.
    """

    # --- per-dataset parallel lists ---
    dataset_name: List[str] = field(
        metadata={"help": "The name of each dataset to use (via the datasets library)."}
    )
    # NOTE: annotated as a list but defaults to None; main() zips over it, so a value must be given.
    dataset_config_name: List[str] = list_field(
        default=None,
        metadata={"help": "The configuration name of each dataset to use (via the datasets library)."},
    )
    train_split_name: List[str] = list_field(
        default=["train+validation"],
        metadata={
            "help": "The name of the training data set split to use (via the datasets library). "
            "Defaults to 'train+validation'"
        },
    )
    eval_split_name: List[str] = list_field(
        default=["test"],
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    audio_column_name: List[str] = list_field(
        default=["audio"],
        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
    )
    text_column_name: List[str] = list_field(
        default=["text"],
        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
    )

    # --- preprocessing controls ---
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
            "value if set."
        },
    )
    chars_to_ignore: Optional[List[str]] = list_field(
        default=None,
        metadata={"help": "A list of characters to remove from the transcripts."},
    )
    eval_metrics: List[str] = list_field(
        default=["wer"],
        metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"},
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": "Whether to only do data preprocessing and skip training. "
            "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
            "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
            "so that the cached datasets can consequently be loaded in distributed training"
        },
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "If :obj:`True`, will use the token generated when running "
            ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
        },
    )

    # --- tokenizer special tokens ---
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer"},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer"},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": "The target language that should be"
            " passed to the tokenizer for tokenization. Note that"
            " this is only relevant if the model classifies the"
            " input audio to a sequence of phoneme sequences."
        },
    )
|
242 |
+
|
243 |
+
|
244 |
+
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.

    The audio inputs and the label ids are padded separately, each through ``processor.pad``,
    and the padded label positions are replaced by ``-100``.

    Args:
        processor (:class:`~transformers.AutoProcessor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`"longest"`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
            The same value is used for both the inputs and the labels.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """

    processor: AutoProcessor
    padding: Union[bool, str] = "longest"
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            features: Examples, each providing ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch returned by ``processor.pad`` with an added ``"labels"``
            entry in which padded positions are set to ``-100``.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # inside this context the processor pads with its target (label) side
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch
|
302 |
+
|
303 |
+
|
304 |
+
def create_vocabulary_from_data(
    datasets: "DatasetDict",
    word_delimiter_token: Optional[str] = None,
    unk_token: Optional[str] = None,
    pad_token: Optional[str] = None,
):
    """Build a character-level vocabulary from the ``target_text`` column.

    The lower-cased characters of the ``"train"`` and ``"eval"`` splits are
    collected, sorted, and numbered from 0. A split that is absent from
    ``datasets`` is skipped, so the function also works when only training or
    only evaluation data was loaded.

    Args:
        datasets: Mapping from split name to a dataset that exposes a
            ``"target_text"`` column of strings.
        word_delimiter_token: If given, replaces the space character and takes
            over its index. When the data contains no space at all, the token
            is appended with the next free index instead.
        unk_token: If given, appended with the next free index.
        pad_token: If given, appended with the next free index (after
            ``unk_token``).

    Returns:
        Dict mapping each character / special token to its integer id.
    """
    # take union of all unique characters in each available split
    vocab_set = set()
    for split in ("train", "eval"):
        if split not in datasets:
            continue
        for transcript in datasets[split]["target_text"]:
            vocab_set.update(transcript.lower())

    vocab_dict = {char: index for index, char in enumerate(sorted(vocab_set))}

    # replace white space with delimiter token
    if word_delimiter_token is not None:
        if " " in vocab_dict:
            vocab_dict[word_delimiter_token] = vocab_dict.pop(" ")
        elif word_delimiter_token not in vocab_dict:
            # no space in the data: still register the delimiter so it has an id
            vocab_dict[word_delimiter_token] = len(vocab_dict)

    # add unk and pad token
    if unk_token is not None:
        vocab_dict[unk_token] = len(vocab_dict)

    if pad_token is not None:
        vocab_dict[pad_token] = len(vocab_dict)

    return vocab_dict
|
353 |
+
|
354 |
+
|
355 |
+
def main():
|
356 |
+
# See all possible arguments in src/transformers/training_args.py
|
357 |
+
# or by passing the --help flag to this script.
|
358 |
+
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
359 |
+
|
360 |
+
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
361 |
+
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
362 |
+
# If we pass only one argument to the script and it's the path to a json file,
|
363 |
+
# let's parse it to get our arguments.
|
364 |
+
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
365 |
+
else:
|
366 |
+
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
367 |
+
|
368 |
+
# Detecting last checkpoint.
|
369 |
+
last_checkpoint = None
|
370 |
+
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
371 |
+
last_checkpoint = get_last_checkpoint(training_args.output_dir)
|
372 |
+
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
|
373 |
+
raise ValueError(
|
374 |
+
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
|
375 |
+
"Use --overwrite_output_dir to overcome."
|
376 |
+
)
|
377 |
+
elif last_checkpoint is not None:
|
378 |
+
logger.info(
|
379 |
+
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
|
380 |
+
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
|
381 |
+
)
|
382 |
+
|
383 |
+
# Setup logging
|
384 |
+
logging.basicConfig(
|
385 |
+
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
386 |
+
datefmt="%m/%d/%Y %H:%M:%S",
|
387 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
388 |
+
)
|
389 |
+
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
|
390 |
+
|
391 |
+
# Log on each process the small summary:
|
392 |
+
logger.warning(
|
393 |
+
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
|
394 |
+
f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
|
395 |
+
)
|
396 |
+
# Set the verbosity to info of the Transformers logger (on main process only):
|
397 |
+
if is_main_process(training_args.local_rank):
|
398 |
+
transformers.utils.logging.set_verbosity_info()
|
399 |
+
logger.info("Training/evaluation parameters %s", training_args)
|
400 |
+
|
401 |
+
# Set seed before initializing model.
|
402 |
+
set_seed(training_args.seed)
|
403 |
+
|
404 |
+
# 1. First, let's load the dataset
|
405 |
+
raw_datasets = DatasetDict()
|
406 |
+
train_datasets = []
|
407 |
+
eval_datasests = []
|
408 |
+
if training_args.do_train:
|
409 |
+
print(data_args.dataset_name,data_args.dataset_config_name, data_args.train_split_name, data_args.audio_column_name, data_args.text_column_name)
|
410 |
+
for dataset_name, dataset_config_name, train_split_name, audio_column_name, text_column_name in zip(data_args.dataset_name, data_args.dataset_config_name, data_args.train_split_name, data_args.audio_column_name, data_args.text_column_name):
|
411 |
+
raw_datasets["train"] = load_dataset(
|
412 |
+
dataset_name,
|
413 |
+
dataset_config_name,
|
414 |
+
split=train_split_name,
|
415 |
+
use_auth_token=data_args.use_auth_token,
|
416 |
+
data_dir="datasets",
|
417 |
+
cache_dir="datasets"
|
418 |
+
)
|
419 |
+
|
420 |
+
if audio_column_name not in raw_datasets["train"].column_names:
|
421 |
+
raise ValueError(
|
422 |
+
f"--audio_column_name '{audio_column_name}' not found in dataset '{dataset_name}'. "
|
423 |
+
"Make sure to set `--audio_column_name` to the correct audio column - one of "
|
424 |
+
f"{', '.join(raw_datasets['train'].column_names)}."
|
425 |
+
)
|
426 |
+
|
427 |
+
if text_column_name not in raw_datasets["train"].column_names:
|
428 |
+
raise ValueError(
|
429 |
+
f"--text_column_name {text_column_name} not found in dataset '{dataset_name}'. "
|
430 |
+
"Make sure to set `--text_column_name` to the correct text column - one of "
|
431 |
+
f"{', '.join(raw_datasets['train'].column_names)}."
|
432 |
+
)
|
433 |
+
|
434 |
+
if data_args.max_train_samples is not None:
|
435 |
+
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
|
436 |
+
if text_column_name != "text":
|
437 |
+
raw_datasets["train"] = raw_datasets["train"].rename_column(text_column_name, "text")
|
438 |
+
if audio_column_name != "audio":
|
439 |
+
raw_datasets["train"] = raw_datasets["train"].rename_column(audio_column_name, "audio")
|
440 |
+
raw_datasets["train"] = raw_datasets["train"].remove_columns([column for column in raw_datasets["train"].column_names if column not in ["text", "audio"]])
|
441 |
+
train_datasets.append(raw_datasets["train"])
|
442 |
+
raw_datasets["train"] = concatenate_datasets(train_datasets)
|
443 |
+
|
444 |
+
if training_args.do_eval:
|
445 |
+
for dataset_name, dataset_config_name, eval_split_name, audio_column_name, text_column_name in zip(data_args.dataset_name, data_args.dataset_config_name, data_args.eval_split_name, data_args.audio_column_name, data_args.text_column_name):
|
446 |
+
raw_datasets["eval"] = load_dataset(
|
447 |
+
dataset_name,
|
448 |
+
dataset_config_name,
|
449 |
+
split=eval_split_name,
|
450 |
+
use_auth_token=data_args.use_auth_token,
|
451 |
+
data_dir="datasets",
|
452 |
+
cache_dir="datasets"
|
453 |
+
)
|
454 |
+
|
455 |
+
if audio_column_name not in raw_datasets["eval"].column_names:
|
456 |
+
raise ValueError(
|
457 |
+
f"--audio_column_name '{audio_column_name}' not found in dataset '{dataset_name}'. "
|
458 |
+
"Make sure to set `--audio_column_name` to the correct audio column - one of "
|
459 |
+
f"{', '.join(raw_datasets['eval'].column_names)}."
|
460 |
+
)
|
461 |
+
|
462 |
+
if text_column_name not in raw_datasets["eval"].column_names:
|
463 |
+
raise ValueError(
|
464 |
+
f"--text_column_name {text_column_name} not found in dataset '{dataset_name}'. "
|
465 |
+
"Make sure to set `--text_column_name` to the correct text column - one of "
|
466 |
+
f"{', '.join(raw_datasets['eval'].column_names)}."
|
467 |
+
)
|
468 |
+
|
469 |
+
if data_args.max_eval_samples is not None:
|
470 |
+
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
|
471 |
+
if text_column_name != "text":
|
472 |
+
raw_datasets["eval"] = raw_datasets["eval"].rename_column(text_column_name, "text")
|
473 |
+
if audio_column_name != "audio":
|
474 |
+
raw_datasets["eval"] = raw_datasets["eval"].rename_column(audio_column_name, "audio")
|
475 |
+
raw_datasets["eval"] = raw_datasets["eval"].remove_columns([column for column in raw_datasets["eval"].column_names if column not in ["text", "audio"]])
|
476 |
+
eval_datasests.append(raw_datasets["eval"])
|
477 |
+
raw_datasets["eval"] = concatenate_datasets(eval_datasests)
|
478 |
+
|
479 |
+
# 2. We remove some special characters from the datasets
|
480 |
+
# that make training complicated and do not help in transcribing the speech
|
481 |
+
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
482 |
+
# that could be easily picked up by the model
|
483 |
+
chars_to_ignore_regex = (
|
484 |
+
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
|
485 |
+
)
|
486 |
+
text_column_name = "text"
|
487 |
+
|
488 |
+
def normalize_numbers(batch):
    """Rewrite ``batch["text"]`` with its numbers normalized, in lower case.

    The text is passed through ``normalize_numbers_ca`` and the lowercased
    result is stored back under ``"text"``. The same ``batch`` object is
    returned so the function can be used with ``Dataset.map``.
    """
    batch["text"] = normalize_numbers_ca(batch["text"]).lower()
    return batch
|
493 |
+
|
494 |
+
with training_args.main_process_first(desc="dataset verbalize numbers"):
|
495 |
+
raw_datasets = raw_datasets.map(
|
496 |
+
normalize_numbers,
|
497 |
+
desc="remove special characters from datasets",
|
498 |
+
)
|
499 |
+
|
500 |
+
def remove_special_characters(batch):
    """Build ``batch["target_text"]`` from the text column.

    Without an ignore-regex the text is only lowercased and given a trailing
    space. With one, the matching characters are stripped first and, after
    lowercasing, "á" is mapped to "a" and "ñ" to "ny".
    """
    source_text = batch[text_column_name]

    if chars_to_ignore_regex is None:
        batch["target_text"] = source_text.lower() + " "
        return batch

    cleaned = re.sub(chars_to_ignore_regex, "", source_text).lower() + " "
    # Literal, single-character substitutions: no regex needed.
    cleaned = cleaned.replace("á", "a").replace("ñ", "ny")
    batch["target_text"] = cleaned
    return batch
|
508 |
+
|
509 |
+
with training_args.main_process_first(desc="dataset map special characters removal"):
|
510 |
+
raw_datasets = raw_datasets.map(
|
511 |
+
remove_special_characters,
|
512 |
+
remove_columns=[text_column_name],
|
513 |
+
desc="remove special characters from datasets",
|
514 |
+
)
|
515 |
+
|
516 |
+
set_characters = set()
|
517 |
+
for string in raw_datasets["train"]["target_text"]:
|
518 |
+
set_characters.update(string.lower())
|
519 |
+
|
520 |
+
vocab = [character for character in "aàbcçdeéèfghiíïjklmnoóòpqrstuúüvwxyz'·-"]
|
521 |
+
|
522 |
+
unwanted_chars = set_characters-set(vocab)-set([' '])
|
523 |
+
|
524 |
+
with training_args.main_process_first(desc="dataset filter non vocab chars"):
|
525 |
+
raw_datasets = raw_datasets.filter(
|
526 |
+
lambda example: not any((c in unwanted_chars) for c in example),
|
527 |
+
input_columns="target_text",
|
528 |
+
desc="remove examples with weird characters"
|
529 |
+
)
|
530 |
+
|
531 |
+
# save special tokens for tokenizer
|
532 |
+
word_delimiter_token = data_args.word_delimiter_token
|
533 |
+
unk_token = data_args.unk_token
|
534 |
+
pad_token = data_args.pad_token
|
535 |
+
|
536 |
+
# 3. Next, let's load the config as we might need it to create
|
537 |
+
# the tokenizer
|
538 |
+
# load config
|
539 |
+
config = AutoConfig.from_pretrained(
|
540 |
+
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
|
541 |
+
)
|
542 |
+
|
543 |
+
# 4. Next, if no tokenizer file is defined,
|
544 |
+
# we create the vocabulary of the model by extracting all unique characters from
|
545 |
+
# the training and evaluation datasets
|
546 |
+
# We need to make sure that only first rank saves vocabulary
|
547 |
+
# make sure all processes wait until vocab is created
|
548 |
+
tokenizer_name_or_path = model_args.tokenizer_name_or_path
|
549 |
+
tokenizer_kwargs = {}
|
550 |
+
if tokenizer_name_or_path is None:
|
551 |
+
# save vocab in training output dir
|
552 |
+
tokenizer_name_or_path = training_args.output_dir
|
553 |
+
|
554 |
+
vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
|
555 |
+
|
556 |
+
with training_args.main_process_first():
|
557 |
+
if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
|
558 |
+
os.remove(vocab_file)
|
559 |
+
|
560 |
+
with training_args.main_process_first(desc="dataset map vocabulary creation"):
|
561 |
+
if not os.path.isfile(vocab_file):
|
562 |
+
os.makedirs(tokenizer_name_or_path, exist_ok=True)
|
563 |
+
vocab_dict = create_vocabulary_from_data(
|
564 |
+
raw_datasets,
|
565 |
+
word_delimiter_token=word_delimiter_token,
|
566 |
+
unk_token=unk_token,
|
567 |
+
pad_token=pad_token,
|
568 |
+
)
|
569 |
+
|
570 |
+
# save vocab dict to be loaded into tokenizer
|
571 |
+
with open(vocab_file, "w") as file:
|
572 |
+
json.dump(vocab_dict, file)
|
573 |
+
|
574 |
+
# if tokenizer has just been created
|
575 |
+
# it is defined by `tokenizer_class` if present in config else by `model_type`
|
576 |
+
tokenizer_kwargs = {
|
577 |
+
"config": config if config.tokenizer_class is not None else None,
|
578 |
+
"tokenizer_type": config.model_type if config.tokenizer_class is None else None,
|
579 |
+
"unk_token": unk_token,
|
580 |
+
"pad_token": pad_token,
|
581 |
+
"word_delimiter_token": word_delimiter_token,
|
582 |
+
}
|
583 |
+
|
584 |
+
# 5. Now we can instantiate the feature extractor, tokenizer and model
|
585 |
+
# Note for distributed training, the .from_pretrained methods guarantee that only
|
586 |
+
# one local process can concurrently download model & vocab.
|
587 |
+
|
588 |
+
# load feature_extractor and tokenizer
|
589 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
590 |
+
tokenizer_name_or_path,
|
591 |
+
use_auth_token=data_args.use_auth_token,
|
592 |
+
**tokenizer_kwargs,
|
593 |
+
)
|
594 |
+
feature_extractor = AutoFeatureExtractor.from_pretrained(
|
595 |
+
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
|
596 |
+
)
|
597 |
+
|
598 |
+
# adapt config
|
599 |
+
config.update(
|
600 |
+
{
|
601 |
+
"feat_proj_dropout": model_args.feat_proj_dropout,
|
602 |
+
"attention_dropout": model_args.attention_dropout,
|
603 |
+
"hidden_dropout": model_args.hidden_dropout,
|
604 |
+
"final_dropout": model_args.final_dropout,
|
605 |
+
"mask_time_prob": model_args.mask_time_prob,
|
606 |
+
"mask_time_length": model_args.mask_time_length,
|
607 |
+
"mask_feature_prob": model_args.mask_feature_prob,
|
608 |
+
"mask_feature_length": model_args.mask_feature_length,
|
609 |
+
"gradient_checkpointing": training_args.gradient_checkpointing,
|
610 |
+
"layerdrop": model_args.layerdrop,
|
611 |
+
"ctc_loss_reduction": model_args.ctc_loss_reduction,
|
612 |
+
"pad_token_id": tokenizer.pad_token_id,
|
613 |
+
"vocab_size": len(tokenizer),
|
614 |
+
"activation_dropout": model_args.activation_dropout,
|
615 |
+
}
|
616 |
+
)
|
617 |
+
|
618 |
+
# create model
|
619 |
+
model = AutoModelForCTC.from_pretrained(
|
620 |
+
model_args.model_name_or_path,
|
621 |
+
cache_dir=model_args.cache_dir,
|
622 |
+
config=config,
|
623 |
+
use_auth_token=data_args.use_auth_token,
|
624 |
+
)
|
625 |
+
|
626 |
+
# freeze encoder
|
627 |
+
if model_args.freeze_feature_encoder:
|
628 |
+
model.freeze_feature_encoder()
|
629 |
+
|
630 |
+
# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
|
631 |
+
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
|
632 |
+
# so that we just need to set the correct target sampling rate and normalize the input
|
633 |
+
# via the `feature_extractor`
|
634 |
+
|
635 |
+
# make sure that dataset decodes audio with correct sampling rate
|
636 |
+
dataset_sampling_rate = next(iter(raw_datasets.values())).features["audio"].sampling_rate
|
637 |
+
# if dataset_sampling_rate != feature_extractor.sampling_rate:
|
638 |
+
raw_datasets = raw_datasets.cast_column(
|
639 |
+
"audio", datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
|
640 |
+
)
|
641 |
+
|
642 |
+
# derive max & min input length for sample rate & max duration
|
643 |
+
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
|
644 |
+
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
|
645 |
+
audio_column_name = "audio"
|
646 |
+
num_workers = data_args.preprocessing_num_workers
|
647 |
+
|
648 |
+
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
|
649 |
+
phoneme_language = data_args.phoneme_language
|
650 |
+
|
651 |
+
# Preprocessing the datasets.
|
652 |
+
# We need to read the audio files as arrays and tokenize the targets.
|
653 |
+
def prepare_dataset(batch):
    """Turn one example into model inputs and CTC labels.

    Adds three keys to ``batch`` and returns it:
      * ``input_values``: first row of the feature extractor output for the audio
      * ``input_length``: ``len()`` of those input values
      * ``labels``: token ids of ``batch["target_text"]``
    """
    # Run the feature extractor on the decoded audio.
    audio = batch[audio_column_name]
    features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    input_values = features.input_values[0]
    batch["input_values"] = input_values
    batch["input_length"] = len(input_values)

    # The phonemizer language is only forwarded when one was configured.
    encode_kwargs = {} if phoneme_language is None else {"phonemizer_lang": phoneme_language}
    batch["labels"] = tokenizer(batch["target_text"], **encode_kwargs).input_ids
    return batch
|
668 |
+
|
669 |
+
raw_datasets = raw_datasets.shuffle(seed=42)
|
670 |
+
|
671 |
+
with training_args.main_process_first(desc="dataset map preprocessing"):
|
672 |
+
vectorized_datasets = raw_datasets.map(
|
673 |
+
prepare_dataset,
|
674 |
+
remove_columns=next(iter(raw_datasets.values())).column_names,
|
675 |
+
num_proc=num_workers,
|
676 |
+
desc="preprocess datasets",
|
677 |
+
)
|
678 |
+
|
679 |
+
def is_audio_in_length_range(length):
    """Return True when ``length`` lies strictly between the min and max input lengths."""
    return min_input_length < length < max_input_length
|
681 |
+
|
682 |
+
# filter data that is shorter than min_input_length or longer than max_input_length
|
683 |
+
vectorized_datasets = vectorized_datasets.filter(
|
684 |
+
is_audio_in_length_range,
|
685 |
+
num_proc=num_workers,
|
686 |
+
input_columns=["input_length"],
|
687 |
+
)
|
688 |
+
|
689 |
+
# 7. Next, we can prepare the training.
|
690 |
+
# Let's use word error rate (WER) as our evaluation metric,
|
691 |
+
# instantiate a data collator and the trainer
|
692 |
+
|
693 |
+
# Define evaluation metrics during training, *i.e.* word error rate, character error rate
|
694 |
+
eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
|
695 |
+
|
696 |
+
# for large datasets it is advised to run the preprocessing on a
|
697 |
+
# single machine first with ``args.preprocessing_only`` since there will most likely
|
698 |
+
# be a timeout when running the script in distributed mode.
|
699 |
+
# In a second step ``args.preprocessing_only`` can then be set to `False` to load the
|
700 |
+
# cached dataset
|
701 |
+
if data_args.preprocessing_only:
|
702 |
+
logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
|
703 |
+
return
|
704 |
+
|
705 |
+
def compute_metrics(pred):
    """Decode predictions and labels, then evaluate every configured metric.

    ``pred.label_ids`` is modified in place: entries equal to -100 are
    replaced by the tokenizer's pad token id before decoding.

    Returns:
        dict mapping each name in ``eval_metrics`` to the value returned by
        that metric's ``compute``.
    """
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    predicted_ids = np.argmax(pred.predictions, axis=-1)
    pred_str = tokenizer.batch_decode(predicted_ids)
    # we do not want to group tokens when computing the metrics
    label_str = tokenizer.batch_decode(label_ids, group_tokens=False)

    metrics = {}
    for metric_name, metric in eval_metrics.items():
        metrics[metric_name] = metric.compute(predictions=pred_str, references=label_str)
    return metrics
|
718 |
+
|
719 |
+
# Now save everything to be able to create a single processor later
|
720 |
+
if is_main_process(training_args.local_rank):
|
721 |
+
# save feature extractor, tokenizer and config
|
722 |
+
feature_extractor.save_pretrained(training_args.output_dir)
|
723 |
+
tokenizer.save_pretrained(training_args.output_dir)
|
724 |
+
config.save_pretrained(training_args.output_dir)
|
725 |
+
|
726 |
+
try:
|
727 |
+
processor = AutoProcessor.from_pretrained(training_args.output_dir)
|
728 |
+
except (OSError, KeyError):
|
729 |
+
warnings.warn(
|
730 |
+
"Loading a processor from a feature extractor config that does not"
|
731 |
+
" include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
|
732 |
+
" attribute to your `preprocessor_config.json` file to suppress this warning: "
|
733 |
+
" `'processor_class': 'Wav2Vec2Processor'`",
|
734 |
+
FutureWarning,
|
735 |
+
)
|
736 |
+
processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
|
737 |
+
|
738 |
+
# Instantiate custom data collator
|
739 |
+
data_collator = DataCollatorCTCWithPadding(processor=processor)
|
740 |
+
|
741 |
+
# Initialize Trainer
|
742 |
+
trainer = Trainer(
|
743 |
+
model=model,
|
744 |
+
data_collator=data_collator,
|
745 |
+
args=training_args,
|
746 |
+
compute_metrics=compute_metrics,
|
747 |
+
train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
|
748 |
+
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
|
749 |
+
tokenizer=feature_extractor,
|
750 |
+
)
|
751 |
+
|
752 |
+
# 8. Finally, we can start training
|
753 |
+
|
754 |
+
# Training
|
755 |
+
if training_args.do_train:
|
756 |
+
|
757 |
+
# use last checkpoint if exist
|
758 |
+
if last_checkpoint is not None:
|
759 |
+
checkpoint = last_checkpoint
|
760 |
+
elif os.path.isdir(model_args.model_name_or_path):
|
761 |
+
checkpoint = model_args.model_name_or_path
|
762 |
+
else:
|
763 |
+
checkpoint = None
|
764 |
+
|
765 |
+
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
766 |
+
trainer.save_model()
|
767 |
+
|
768 |
+
metrics = train_result.metrics
|
769 |
+
max_train_samples = (
|
770 |
+
data_args.max_train_samples
|
771 |
+
if data_args.max_train_samples is not None
|
772 |
+
else len(vectorized_datasets["train"])
|
773 |
+
)
|
774 |
+
metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
|
775 |
+
|
776 |
+
trainer.log_metrics("train", metrics)
|
777 |
+
trainer.save_metrics("train", metrics)
|
778 |
+
trainer.save_state()
|
779 |
+
|
780 |
+
# Evaluation
|
781 |
+
results = {}
|
782 |
+
if training_args.do_eval:
|
783 |
+
logger.info("*** Evaluate ***")
|
784 |
+
metrics = trainer.evaluate()
|
785 |
+
max_eval_samples = (
|
786 |
+
data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
|
787 |
+
)
|
788 |
+
metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
|
789 |
+
|
790 |
+
trainer.log_metrics("eval", metrics)
|
791 |
+
trainer.save_metrics("eval", metrics)
|
792 |
+
|
793 |
+
# Write model card and (optionally) push to hub
|
794 |
+
config_name = data_args.dataset_config_name[0] if data_args.dataset_config_name is not None else "na"
|
795 |
+
kwargs = {
|
796 |
+
"finetuned_from": model_args.model_name_or_path,
|
797 |
+
"tasks": "speech-recognition",
|
798 |
+
"tags": ["automatic-speech-recognition"]+data_args.dataset_name[0],
|
799 |
+
"dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name[0]}, Eval split: {data_args.eval_split_name[0]}",
|
800 |
+
"dataset": f"{data_args.dataset_name[0].upper()} - {config_name.upper()}",
|
801 |
+
}
|
802 |
+
if "common_voice" in data_args.dataset_name[0]:
|
803 |
+
kwargs["language"] = config_name
|
804 |
+
|
805 |
+
if training_args.push_to_hub:
|
806 |
+
trainer.push_to_hub(**kwargs)
|
807 |
+
else:
|
808 |
+
trainer.create_model_card(**kwargs)
|
809 |
+
|
810 |
+
return results
|
811 |
+
|
812 |
+
|
813 |
+
if __name__ == "__main__":
|
814 |
+
main()
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
text/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright (c) 2017 Keith Ito
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4 |
+
of this software and associated documentation files (the "Software"), to deal
|
5 |
+
in the Software without restriction, including without limitation the rights
|
6 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7 |
+
copies of the Software, and to permit persons to whom the Software is
|
8 |
+
furnished to do so, subject to the following conditions:
|
9 |
+
|
10 |
+
The above copyright notice and this permission notice shall be included in
|
11 |
+
all copies or substantial portions of the Software.
|
12 |
+
|
13 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19 |
+
THE SOFTWARE.
|
text/__init__.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
import re
|
3 |
+
from text import cleaners
|
4 |
+
from text.symbols import symbols
|
5 |
+
|
6 |
+
|
7 |
+
# Lookup tables between each symbol and its integer ID (its position in `symbols`):
_symbol_to_id = dict((symbol, index) for index, symbol in enumerate(symbols))
_id_to_symbol = dict(enumerate(symbols))

# Splits text into: what precedes the first {...} group, the group's contents, and the rest:
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
13 |
+
|
14 |
+
|
15 |
+
def text_to_sequence(text, cleaner_names):
    '''Encodes a string as the list of symbol IDs that represent it.

    Segments wrapped in curly braces are treated as ARPAbet and are not run
    through the cleaners, e.g. "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions applied to the plain-text parts

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    ids = []
    remaining = text

    # Peel off one "plain text + {ARPAbet}" pair per iteration.
    while len(remaining) > 0:
        match = _curly_re.match(remaining)
        if match is None:
            # No curly-brace group left: everything remaining is plain text.
            ids.extend(_symbols_to_sequence(_clean_text(remaining, cleaner_names)))
            break
        plain, arpabet, remaining = match.groups()
        ids.extend(_symbols_to_sequence(_clean_text(plain, cleaner_names)))
        ids.extend(_arpabet_to_sequence(arpabet))

    return ids
|
41 |
+
|
42 |
+
|
43 |
+
def sequence_to_text(sequence):
    '''Turns a sequence of symbol IDs back into text; unknown IDs are skipped.'''
    pieces = []
    for symbol_id in sequence:
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        # Symbols stored with a leading '@' are ARPAbet: wrap them in curly braces again.
        if len(symbol) > 1 and symbol.startswith('@'):
            symbol = '{%s}' % symbol[1:]
        pieces.append(symbol)
    # Adjacent ARPAbet symbols share one pair of braces, separated by spaces.
    return ''.join(pieces).replace('}{', ' ')
|
54 |
+
|
55 |
+
|
56 |
+
def _clean_text(text, cleaner_names):
    '''Runs `text` through each named cleaner of the `cleaners` module, in order.

    Args:
      text: string to clean
      cleaner_names: names of the cleaner functions to apply, in order

    Returns:
      The text after every cleaner has been applied.

    Raises:
      Exception: if a name does not correspond to a cleaner in `cleaners`.
    '''
    for name in cleaner_names:
        # A default is required here: without it an unknown name raises
        # AttributeError inside getattr() and the explicit check below can
        # never report the "Unknown cleaner" error.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
63 |
+
|
64 |
+
|
65 |
+
def _symbols_to_sequence(symbols):
    '''Maps each symbol accepted by `_should_keep_symbol` to its ID, preserving order.'''
    ids = []
    for symbol in symbols:
        if _should_keep_symbol(symbol):
            ids.append(_symbol_to_id[symbol])
    return ids
|
67 |
+
|
68 |
+
|
69 |
+
def _arpabet_to_sequence(text):
    '''Converts whitespace-separated ARPAbet symbols to IDs via their '@'-prefixed names.'''
    prefixed = ['@' + phoneme for phoneme in text.split()]
    return _symbols_to_sequence(prefixed)
|
71 |
+
|
72 |
+
|
73 |
+
def _should_keep_symbol(s):
    '''Returns True if `s` is a known symbol other than '_' and '~'.

    The two excluded symbols are compared by value: identity comparison
    (`is not`) against a string literal depends on interpreter interning and
    emits a SyntaxWarning on Python 3.8+.
    '''
    return s in _symbol_to_id and s not in ('_', '~')
|
text/ca.sor
ADDED
@@ -0,0 +1,485 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
^0 zero
|
2 |
+
1$ u
|
3 |
+
1 un
|
4 |
+
2 dos
|
5 |
+
3 tres
|
6 |
+
4 quatre
|
7 |
+
5 cinc
|
8 |
+
6 sis
|
9 |
+
7 set
|
10 |
+
8 huit # [:ca-valencia:] [:ca-ES-valencia:]
|
11 |
+
8 vuit
|
12 |
+
9 nou
|
13 |
+
#10-19
|
14 |
+
10 deu
|
15 |
+
11 onze
|
16 |
+
12 dotze
|
17 |
+
13 tretze
|
18 |
+
14 catorze
|
19 |
+
15 quinze
|
20 |
+
16 setze
|
21 |
+
17 dèsset # [:ca-valencia:] [:ca-ES-valencia:]
|
22 |
+
17 desset # [:ca-balear:] [:ca-ES-balear:]
|
23 |
+
17 disset
|
24 |
+
18 devuit # [:ca-balear:] [:ca-ES-balear:]
|
25 |
+
18 díhuit # [:ca-valencia:] [:ca-ES-valencia:]
|
26 |
+
19 denou # [:ca-balear:] [:ca-ES-balear:]
|
27 |
+
19 dènou # [:ca-valencia:] [:ca-ES-valencia:]
|
28 |
+
1(\d) di$1
|
29 |
+
# 20-29
|
30 |
+
20 vint
|
31 |
+
2(\d) vint-i-$1
|
32 |
+
# 30, 40, 50, 60, 70, 80, 90
|
33 |
+
30 trenta
|
34 |
+
40 quaranta
|
35 |
+
50 cinquanta
|
36 |
+
60 seixanta
|
37 |
+
70 setanta
|
38 |
+
80 huitanta # [:ca-valencia:] [:ca-ES-valencia:]
|
39 |
+
80 vuitanta
|
40 |
+
90 noranta
|
41 |
+
(\d)(\d) $(\10)-$2
|
42 |
+
|
43 |
+
#100-199
|
44 |
+
100 cent
|
45 |
+
1(\d\d) cent $1
|
46 |
+
#200-999
|
47 |
+
(\d)00 $1-cents
|
48 |
+
(\d)(\d\d) $1-cents $2
|
49 |
+
|
50 |
+
#1000-1999
|
51 |
+
1000 mil
|
52 |
+
1(\d{3}) mil $1
|
53 |
+
|
54 |
+
#2000-999999
|
55 |
+
(\d{1,3})000 $1 mil
|
56 |
+
(\d{1,3})(\d{3}) $1 mil $2
|
57 |
+
|
58 |
+
# our limit is number <10^606
|
59 |
+
(\d{606,}) ""
|
60 |
+
|
61 |
+
# x-lions
|
62 |
+
# 1000000=10^6 -> un milió
|
63 |
+
1((0{6})+) un $(pre:$(count:\1))lió
|
64 |
+
1((\d{6})+) un $(pre:$(count:\1))lió $1
|
65 |
+
# 2000000=2·10^6 -> dos milions
|
66 |
+
(\d{1,6})((0{6})+) $1 $(pre:$(count:\2))lions
|
67 |
+
(\d{1,6})((\d{6})+) $1 $(pre:$(count:\2))lions $2
|
68 |
+
|
69 |
+
|
70 |
+
# count number of 10^6, useful for x-lions, and x-liards prefixes.
|
71 |
+
count:.{0,5}? 0
|
72 |
+
count:.{6}.{0,5} 1
|
73 |
+
count:(.{12}).{0,5} 2
|
74 |
+
count:(.{18}).{0,5} 3
|
75 |
+
count:(.{24}).{0,5} 4
|
76 |
+
count:(.{30}).{0,5} 5
|
77 |
+
count:(.{36}).{0,5} 6
|
78 |
+
count:(.{42}).{0,5} 7
|
79 |
+
count:(.{48}).{0,5} 8
|
80 |
+
count:(.{54}).{0,5} 9
|
81 |
+
count:(.{60})(.{0,59}) 1|$(count:\2)
|
82 |
+
count:(.{120})(.{0,59}) 2|$(count:\2)
|
83 |
+
count:(.{180})(.{0,59}) 3|$(count:\2)
|
84 |
+
count:(.{240})(.{0,59}) 4|$(count:\2)
|
85 |
+
count:(.{300})(.{0,59}) 5|$(count:\2)
|
86 |
+
count:(.{360})(.{0,59}) 6|$(count:\2)
|
87 |
+
count:(.{420})(.{0,59}) 7|$(count:\2)
|
88 |
+
count:(.{480})(.{0,59}) 8|$(count:\2)
|
89 |
+
count:(.{540})(.{0,59}) 9|$(count:\2)
|
90 |
+
count:(.{600})(.{0,5}) 10|$(count:\2) # our limit is 10^606-1
|
91 |
+
|
92 |
+
# prefixes needed for x-lions and x-liards, up to 10^606-1
|
93 |
+
pre:1 mi
|
94 |
+
pre:2 bi
|
95 |
+
pre:3 tri
|
96 |
+
pre:4 quadri
|
97 |
+
pre:5 quinti
|
98 |
+
pre:6 sexti
|
99 |
+
pre:7 septi
|
100 |
+
pre:8 octi
|
101 |
+
pre:9 noni
|
102 |
+
pre:10 deci
|
103 |
+
pre:1(\d) $(pre2:\1)|deci
|
104 |
+
pre:(\d)0 $(pre3:\1)
|
105 |
+
pre:(\d)(\d) $(pre2:\2)|$(pre3:\1)
|
106 |
+
pre:100 centi
|
107 |
+
|
108 |
+
pre2:1 uno
|
109 |
+
pre2:2 duo
|
110 |
+
pre2:3 tre
|
111 |
+
pre2:4 quattour
|
112 |
+
pre2:5 quin
|
113 |
+
pre2:6 sex
|
114 |
+
pre2:7 septen
|
115 |
+
pre2:8 octo
|
116 |
+
pre2:9 novem
|
117 |
+
|
118 |
+
pre3:1 deci
|
119 |
+
pre3:2 viginti
|
120 |
+
pre3:3 triginti
|
121 |
+
pre3:4 quadraginti
|
122 |
+
pre3:5 quinquaginti
|
123 |
+
pre3:6 sexaginti
|
124 |
+
pre3:7 septuaginti
|
125 |
+
pre3:8 octoginti
|
126 |
+
pre3:9 nonoginti
|
127 |
+
pre3:10 centi
|
128 |
+
|
129 |
+
# negative number
|
130 |
+
[--](\d+) menys |$1
|
131 |
+
|
132 |
+
# decimals
|
133 |
+
"([^,]*\d)[.]((\d{3})+)([,][^,.]*)?" $(\1\2\4)
|
134 |
+
"([--]?\d+)([,]0*)?" $1
|
135 |
+
"([--]?\d+)[,](\d*)" $(\1·\2)
|
136 |
+
"([--]?\d+·0*)([^0]00?)0*" $1| |$2
|
137 |
+
"([--]?\d+·0*)([^0])" $1| |$2
|
138 |
+
"([--]?\d+·0*)([^0]\d)" $1| |$2
|
139 |
+
"([--]?\d+·0*)([^0]\d\d)" $1| |$2
|
140 |
+
"([--]?\d+·0*)([^0]\d\d)0*" $1| |$2
|
141 |
+
|
142 |
+
"([--]?\d+·0*)(([^0]|[^0]\d*[^0]))0*" $1| $(read:\2)
|
143 |
+
"([--]?\d+)·(\d*)(\d)" $(\1·\2)| |$3
|
144 |
+
"([--]?\d+)·" $1| coma
|
145 |
+
|
146 |
+
# used for decimal part
|
147 |
+
#read:(\d*[^0])0*$ $(read:\1)
|
148 |
+
read:(\d*[1-9])(00+)([1-9]\d*) $(read:\1)| |$(read:\2) |$(read:\3)
|
149 |
+
read:(\d$) $1
|
150 |
+
read:0(\d+) $(read:0)| |$(read:\1)
|
151 |
+
read:([1-9]\d) $1
|
152 |
+
read:([1-9]\d\d) $1
|
153 |
+
read:(\d\d\d) $1
|
154 |
+
read:(\d\d)((\d\d)+) $(read:\1)| |$(read:\2)
|
155 |
+
read:(\d\d)((\d\d)*)(\d\d\d) $(read:\1)| |$(read:\2)| |$(read:\4)
|
156 |
+
|
157 |
+
|
158 |
+
# convert masculine forms to feminine forms
|
159 |
+
# it can be run after: standard number conversion; and after ordinal, partitive functions.
|
160 |
+
## run with feminine function.
|
161 |
+
f:(.*iliard)(.*) \1$(f:\2) # convert only <1,000,000,000
|
162 |
+
f:(.*ili)(.*) \1$(f:\2) # convert only <100,0000
|
163 |
+
f:(.*d)o(s[^èé]*) $(f:\1ue\2) # 2 -> dos -> dues
|
164 |
+
f:(.*cent)(s.*) $(f:\1e\2) # cents -> centes
|
165 |
+
f:(((.*)[^a-zèé]|))u$ \1una # vint-i-u -> vint-i-una
|
166 |
+
## run after ord function.
|
167 |
+
f:(.*[^0-9])n$ \1na # segon -> segona
|
168 |
+
f:(.*[^0-9]r)$ \1a # tercer -> tercera
|
169 |
+
f:(.*[^0-9]r)t$ \1ta # quart -> quarta
|
170 |
+
f:(.*[^0-9])è$ \1ena # sisè -> sisena
|
171 |
+
f:(.*[^0-9])é$ \1ena # sisé -> sisena
|
172 |
+
## run after ord2 function.
|
173 |
+
f:(.*[0-9])[nrtè]$ \1a # 2n -> 2a
|
174 |
+
## run after part function.
|
175 |
+
f:(.*ter)ç$ \1cera # terç -> tercera
|
176 |
+
f:(.*è[sc]i)m$ \1ma # milionèsim -> milionèsima
|
177 |
+
f:(.*[^0-9]i)g$ \1tja # mig -> mitja
|
178 |
+
|
179 |
+
|
180 |
+
no-centes:(.*)centes(.*) \1cents\2
|
181 |
+
no-centes:(.*) \1
|
182 |
+
|
183 |
+
# convert ordinal numbers (1st, 2nd, 3rd,... nth) to partitive (1, 1/2, 1/3, .... 1/n)
|
184 |
+
p:(.*)primer$ \1unitat
|
185 |
+
p:(.*)segon$ \1mig
|
186 |
+
p:(.*)tercer$ \1terç
|
187 |
+
p:(.*quart)$ \1
|
188 |
+
p:(.*)des[èé]$ \1dècim
|
189 |
+
p:((.*)cent)[èé]$ \1èsim
|
190 |
+
p:((.*)mil)[èé]$ \1·lèsim
|
191 |
+
p:((.*)ilion)[èé]$ \1èsim
|
192 |
+
p:((.*)iliard)[èé]$ \1èsim
|
193 |
+
|
194 |
+
|
195 |
+
# fallback, ignore 1-letter not-defined functions
|
196 |
+
.:(.*) \1
|
197 |
+
|
198 |
+
# run after ordinal and partitive functions
|
199 |
+
pl:(.*[^\d][nrtnec])$ \1s
|
200 |
+
pl:(.*[^\d])ig$ \1igs # mig -> mitjos
|
201 |
+
pl:(.*[^\d])ja$ \1ges
|
202 |
+
pl:(.*[^\d])a$ \1es
|
203 |
+
pl:(.*[^\d])[èé]$ \1ens
|
204 |
+
# after ord2: 1r->1rs, 2n->2ns, 5è->5ns, ...
|
205 |
+
pl:(\d+[rnrt])$ \1s # 1r -> 1rs, 2n -> 2ns, 4t -> 4ts
|
206 |
+
pl:(\d+)[èé]$ \1ns # 5è -> 5ns
|
207 |
+
pl:(\d+)a$ \1es # 2a -> 2es
|
208 |
+
# after partitive
|
209 |
+
pl:([^[0-9]*[sç])$ \1os # dos -> dosos, terç > terços
|
210 |
+
pl:([^[0-9]*è[sc]im)$ \1s # dècim -> dècims
|
211 |
+
#fallback
|
212 |
+
pl:(.*) \1
|
213 |
+
|
214 |
+
|
215 |
+
# unit/subunit singular/plural
|
216 |
+
# million or greater part of the number name separated by "ili" pattern
|
217 |
+
# before masculine to feminine conversion
|
218 |
+
us(.).:([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \2
|
219 |
+
up(.).:([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \3
|
220 |
+
ud(.).:([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \4
|
221 |
+
ss.(.):([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \5
|
222 |
+
sp.(.):([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \6
|
223 |
+
|
224 |
+
# "mm" means masculine unit and masculine subunit
|
225 |
+
# Usually used by Catalan users
|
226 |
+
CHF:(.+),(.+) $(\2mm: franc suís, francs suïssos, de francs suïssos, cèntim, cèntims, \1)
|
227 |
+
EUR:(.+),(.+) $(\2mm: euro, euros, d'euros, cèntim, cèntims, \1)
|
228 |
+
GBP:(.+),(.+) $(\2fm: lliura esterlina, lliures esterlines, de lliures esterlines, penic, penics, \1)
|
229 |
+
JPY:(.+),(.+) $(\2mm: ien, iens, de iens, sen, sen, \1)
|
230 |
+
USD:(.+),(.+) $(\2mm: dòlar dels EUA, dòlars dels EUA, de dòlars dels EUA, centau, centaus, \1)
|
231 |
+
# ACTIVE ISO 4217 CODES--A--
|
232 |
+
AED:(.+),(.+) $(\2mm: dírham dels Emirats Àrabs Units, dírhams dels Emirats Àrabs Units, de dírhams dels Emirats Àrabs Units, fils, fulús, \1)
|
233 |
+
AFN:(.+),(.+) $(\2mm: afgani, afganis, d'afganis, puli, puli, \1)
|
234 |
+
ALL:(.+),(.+) $(\2mm: lek, lekë, de lekë, qindarka, qindarka, \1)
|
235 |
+
AMD:(.+),(.+) $(\2mm: dram, drams, de drams, luma, luma, \1)
|
236 |
+
ANG:(.+),(.+) $(\2mm: florí de les Antilles Neerlandeses, florins de les Antilles Neerlandeses, de florins de les Antilles Neerlandeses, cèntim, cèntims, \1)
|
237 |
+
AOA:(.+),(.+) $(\2fm: kwanza, kwanzes, de kwanzes, cèntim, cèntims, \1)
|
238 |
+
ARS:(.+),(.+) $(\2mm: peso argentí, pesos argentins, de pesos argentins, centau, centaus, \1)
|
239 |
+
AUD:(.+),(.+) $(\2mm: dòlar australià, dòlars australians, de dòlars australians, centau, centaus, \1)
|
240 |
+
AWG:(.+),(.+) $(\2mm: florí d'Aruba, florins d'Aruba, de florins d'Aruba, cèntim, cèntims, \1)
|
241 |
+
AZN:(.+),(.+) $(\2mm: manat azerbaidjanès, manats azerbaidjanesos, de manats azerbaidjanesos, qəpik, qəpik, \1)
|
242 |
+
# ACTIVE ISO 4217 CODES --X--
|
243 |
+
#XAF Franc CFA emès pel BEAC (Banc dels Estats de l'Àfrica Central)
|
244 |
+
XAG:(.+),(.+) $(\2fm: unça de plata, unces de plata, d'unces de plata, cèntim, cèntims, \1)
|
245 |
+
XAU:(.+),(.+) $(\2fm: unça d'or, unces d'or, d'unces d'or, cèntim, cèntims, \1)
|
246 |
+
#XBA Unitat compensatòria europea (EURCO) (unitat per al mercat d'obligacions)
|
247 |
+
#XBB Unitat monetària europea (EMU-6) (unitat per al mercat d'obligacions)
|
248 |
+
#XBC Unitat de compte europea 9 (EUA-9) (unitat per al mercat d'obligacions)
|
249 |
+
#XBD Unitat de compte europea 17 (EUA-17) (unitat per al mercat d'obligacions)
|
250 |
+
#XCD Dòlar del Carib Oriental
|
251 |
+
#XDR Drets especials de gir (del Fons Monetari Internacional)
|
252 |
+
#XFU Franc UIC (divisa especial)
|
253 |
+
#XOF Franc CFA emès pel BCEAO (Banc Central dels Estats de l'Àfrica Occidental)
|
254 |
+
XPD:(.+),(.+) $(\2fm: unça de pal·ladi, unces de pal·ladi, d'unces de pal·ladi, cèntim, cèntims, \1)
|
255 |
+
#XPF Franc CFP (per als territoris francesos del Pacífic)
|
256 |
+
XPT:(.+),(.+) $(\2fm: unça de platí, unces de platí, d'unces de platí, cèntim, cèntims, \1)
|
257 |
+
#XTS Codi reservat per a proves
|
258 |
+
#XXX Sense moneda, sense transacció monetària
|
259 |
+
# OBSOLETE ISO 4217 CODES --Replaced by EUR--
|
260 |
+
ADF:(.+),(.+) $(\2mm: franc andorrà, francs andorrans, de francs andorrans, cèntim, cèntims, \1)
|
261 |
+
ADP:(.+),(.+) $(\2fm: pesseta andorrana, pessetes andorranes, de pessetes andorranes, cèntim, cèntims, \1)
|
262 |
+
ATS:(.+),(.+) $(\2mm: xíling austríac, xílings austríacs, de xílings austríacs, groschen, groschen, \1)
|
263 |
+
BEF:(.+),(.+) $(\2mm: franc belga, francs belgues, de francs belgues, cèntim, cèntims, \1)
|
264 |
+
CYP:(.+),(.+) $(\2fm: lliura xipriota, lliures xipriotes, de lliures xipriotes, cèntim, cèntims, \1)
|
265 |
+
DEM:(.+),(.+) $(\2mm: marc alemany, marcs alemanys, de marcs alemanys, penic, penics, \1)
|
266 |
+
ESP:(.+),(.+) $(\2fm: pesseta, pessetes, de pessetes, cèntim, cèntims, \1)
|
267 |
+
FIM:(.+),(.+) $(\2mm: marc finlandès, marcs finlandesos, de marcs finlandesos, penic, penics, \1)
|
268 |
+
FRF:(.+),(.+) $(\2mm: franc francès, francs francesos, de francs francesos, cèntim, cèntims, \1)
|
269 |
+
GRD:(.+),(.+) $(\2fm: dracma grega, dracmes gregues, de dracmes gregues, leptó, leptà, \1)
|
270 |
+
IEP:(.+),(.+) $(\2fm: lliura irlandesa, lliures irlandeses, de lliures irlandeses, penic, penics, \1)
|
271 |
+
ITL:(.+),(.+) $(\2fm: lira italiana, lires italianes, de lires italianes, cèntim, cèntims, \1)
|
272 |
+
LUF:(.+),(.+) $(\2mm: franc luxemburguès, francs luxemburguesos, de francs luxemburguesos, cèntim, cèntims, \1)
|
273 |
+
MCF:(.+),(.+) $(\2mm: franc monegasc, francs monegascs, de francs monegascs, cèntim, cèntims, \1)
|
274 |
+
MTL:(.+),(.+) $(\2fm: lira maltesa, lires malteses, de lires malteses, cèntim, cèntims, \1)
|
275 |
+
NLG:(.+),(.+) $(\2mm: florí neerlandès, florins neerlandesos, de florins neerlandesos, cèntim, cèntims, \1)
|
276 |
+
PTE:(.+),(.+) $(\2mm: escut portuguès, escuts portuguesos, d'escuts portuguesos, centau, centaus, \1)
|
277 |
+
SIT:(.+),(.+) $(\2mm: tolar eslovè, tolars eslovens, de tolars eslovens, stotin, stotinov, \1)
|
278 |
+
SKK:(.+),(.+) $(\2fm: corona eslovaca, corones eslovaques, de corones eslovaques, halier, halierov, \1)
|
279 |
+
SML:(.+),(.+) $(\2fm: lira de San Marino, lires de San Marino, de lires de San Marino, cèntim, cèntims, \1)
|
280 |
+
VAL:(.+),(.+) $(\2fm: lira vaticana, lires vaticanes, de lires vaticanes, cèntim, cèntims, \1)
|
281 |
+
XEU:(.+),(.+) $(\2mm: ecu, ecus, d'ecus, cèntim, cèntims, \1)
|
282 |
+
|
283 |
+
#crypto-currencies
|
284 |
+
XMR:(.+),(.+) $(\2mm: monero, moneros, de moneros, piconero, piconeros, \1) #TODO: 1,000,000,000,000 piconeros = 1 monero
|
285 |
+
XBT:(.+),(.+) $(\2mm: bitcoin, bitcoins, de bitcoins, satoshi, satoshis, \1) # TODO: 100,000,000 satoshis = 1,000 millibitcoins = 1 bitcoin
|
286 |
+
|
287 |
+
# unknown currency
|
288 |
+
[A-Z]{3}:.* ""
|
289 |
+
|
290 |
+
|
291 |
+
"([A-Z]{3}) ([-−]?1)([.,]00?)?"$(\1:|$2,us)
|
292 |
+
"([A-Z]{3}) ([-−]?\d+0{6,})([.,]00?)?"$(\1:|$2,ud)
|
293 |
+
"([A-Z]{3}) ([-−]?\d+)([.,]00?)?"$(\1:|$2,up)
|
294 |
+
"(([A-Z]{3}) [-−]?\d+)[.,](01)" $1 amb$(\2:un,ss)
|
295 |
+
"(([A-Z]{3}) [-−]?\d+)[.,](\d)" $1 amb$(\2:|$(\30),sp)
|
296 |
+
"(([A-Z]{3}) [-−]?\d+)[.,](\d\d)" $1 amb$(\2:|$3,sp)
|
297 |
+
|
298 |
+
|
299 |
+
# detects number followed by currency code
|
300 |
+
"([-−]?\d+)([.,]\d+)? ([A-Z]{3})" $(\3 \1\2)
|
301 |
+
|
302 |
+
|
303 |
+
# currency symbols
|
304 |
+
"€[ ]?([^ ]*)" $(EUR \1)
|
305 |
+
"£[ ]?([^ ]*)" $(GBP \1)
|
306 |
+
"\$[ ]?([^ ]*)" $(USD \1)
|
307 |
+
"¥[ ]?([^ ]*)" $(JPY \1)
|
308 |
+
"₩[ ]?([^ ]*)" $(KRW \1)
|
309 |
+
"₽[ ]?([^ ]*)" $(RUB \1)
|
310 |
+
"ɱ[ ]?([^ ]*)" $(XMR \1)
|
311 |
+
"₿[ ]?([^ ]*)" $(XBT \1)
|
312 |
+
|
313 |
+
"([^ ]+)[ ]?€$" $(EUR \1)
|
314 |
+
"([^ ]+)[ ]?£$" $(GBP \1)
|
315 |
+
"([^ ]+)[ ]?\$$" $(USD \1)
|
316 |
+
"([^ ]+)[ ]?¥$" $(JPY \1)
|
317 |
+
"([^ ]+)[ ]?₩$" $(KRW \1)
|
318 |
+
"([^ ]+)[ ]?₽$" $(RUB \1)
|
319 |
+
"([^ ]+)[ ]?ɱ$" $(XMR \1)
|
320 |
+
"([^ ]+)[ ]?₿$" $(XBT \1)
|
321 |
+
|
322 |
+
== feminine ==
|
323 |
+
|
324 |
+
1 una
|
325 |
+
(.*) $(f:|$1)
|
326 |
+
|
327 |
+
== masculine ==
|
328 |
+
|
329 |
+
1 un
|
330 |
+
(.*) $1
|
331 |
+
|
332 |
+
== ordinal(-masculine)? ==
|
333 |
+
|
334 |
+
([-−]\d+) ""
|
335 |
+
\d+[,.] ""
|
336 |
+
0 zeroé # [:ca-valencia:] [:ca-ES-valencia:]
|
337 |
+
0 zeroè
|
338 |
+
1 primer
|
339 |
+
2 segon
|
340 |
+
3 tercer
|
341 |
+
4 quart
|
342 |
+
(\d+)$ $(ordinal $2)
|
343 |
+
"un ([^ ]*(ilió|iliard))$" $(ordinal \2)
|
344 |
+
(.*li)ó$ \2oné # [:ca-valencia:] [:ca-ES-valencia:]
|
345 |
+
(.*li)ó$ \2onè
|
346 |
+
(.*(cent|mil|ion|iliard))s?$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
|
347 |
+
(.*(cent|mil|ion|iliard))s?$ \2è
|
348 |
+
"(.* )u$" \2uné # [:ca-valencia:] [:ca-ES-valencia:]
|
349 |
+
"(.* )u$" \2unè
|
350 |
+
(.*-)u$ \2uné # [:ca-valencia:] [:ca-ES-valencia:]
|
351 |
+
(.*-)u$ \2unè
|
352 |
+
"u" primer
|
353 |
+
"un" primer
|
354 |
+
"dos" segon
|
355 |
+
"tres" terç
|
356 |
+
"quatre" quart
|
357 |
+
(.*)cinc$ \2cinqué # [:ca-valencia:] [:ca-ES-valencia:]
|
358 |
+
(.*)cinc$ \2cinquè
|
359 |
+
(.*)dènou$ \2denové # [:ca-valencia:] [:ca-ES-valencia:]
|
360 |
+
(.*)nou$ \2nové # [:ca-valencia:] [:ca-ES-valencia:]
|
361 |
+
(.*)nou$ \2novè
|
362 |
+
(.*)deu$ \2desé # [:ca-valencia:] [:ca-ES-valencia:]
|
363 |
+
(.*)deu$ \2desè
|
364 |
+
(.*)dèsset$ \2desseté # [:ca-valencia:] [:ca-ES-valencia:]
|
365 |
+
(.*)díhuit$ \2dihuité # [:ca-valencia:] [:ca-ES-valencia:]
|
366 |
+
(.*)[ae]$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
|
367 |
+
(.*)[ae]$ \2è
|
368 |
+
(.*\D)$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
|
369 |
+
(.*\D)$ \2è
|
370 |
+
|
371 |
+
== ordinal-feminine ==
|
372 |
+
([-−]\d+) ""
|
373 |
+
\d+[,.] ""
|
374 |
+
(\d+)$ $(no-centes:$(f:$(ordinal \1)))
|
375 |
+
|
376 |
+
== ordinal-masculine-plural ==
|
377 |
+
|
378 |
+
([-−]?\d+) $(ordinal-masculine-plural $(ordinal \1))
|
379 |
+
primer primers
|
380 |
+
segon segons
|
381 |
+
(.*)è \1ens
|
382 |
+
(.*)er \1ers
|
383 |
+
|
384 |
+
== ordinal-feminine-plural ==
|
385 |
+
|
386 |
+
([-−]?\d+) $(ordinal-feminine-plural $(ordinal-feminine \1))
|
387 |
+
(.*)a \1es
|
388 |
+
|
389 |
+
== ordinal-number(-masculine)? ==
|
390 |
+
|
391 |
+
#(\d+) $(o:\2)
|
392 |
+
1$ 1r
|
393 |
+
2$ 2n
|
394 |
+
3$ 3r
|
395 |
+
4$ 4t
|
396 |
+
(\d+)$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
|
397 |
+
(\d+)$ \2è
|
398 |
+
|
399 |
+
== ordinal-number-feminine ==
|
400 |
+
(\d+)$ \1a
|
401 |
+
|
402 |
+
== partitive(-masculine)? ==
|
403 |
+
([--]?\d+) $(p:$(ordinal \2))
|
404 |
+
|
405 |
+
== partitive-feminine ==
|
406 |
+
([--]?\d+) $(no-centes:$(f:$(p:$(ordinal \1))))
|
407 |
+
|
408 |
+
|
409 |
+
== partitive(-masculine)?-plural ==
|
410 |
+
([--]?\d+) $(pl:$(p:$(ordinal $2)))
|
411 |
+
|
412 |
+
== partitive-feminine-plural ==
|
413 |
+
([--]?\d+) $(no-centes:$(pl:$(f:$(p:$(ordinal $1)))))
|
414 |
+
|
415 |
+
== fraction(-masculine)? ==
|
416 |
+
([--]?1)(/1)? $2
|
417 |
+
([--]?1)/2 mig
|
418 |
+
([--]?1)/([3-9]\d*) $(masculine \2)| $(partitive \3)
|
419 |
+
([--]?\d+)(/1)? $2
|
420 |
+
([--]?\d+)/([1-9]\d*) $2| $(partitive-plural \3)
|
421 |
+
|
422 |
+
== fraction-feminine ==
|
423 |
+
([--]?1)(/1)? $(f:$1)| unitat
|
424 |
+
([--]?1)/([1-9]\d*) $(f:$1)| $(partitive-feminine \2)| part
|
425 |
+
([--]?\d+)(/1)? $(f:$1)| unitats
|
426 |
+
([--]?\d+)/([1-9]\d*) $(f:$1)| $(partitive-feminine-plural \2)| parts
|
427 |
+
|
428 |
+
== collective ==
|
429 |
+
2 parell, parella o duo
|
430 |
+
3 tern, terna, tercet, trio, tríada o treset
|
431 |
+
4 qüern, tètrada, quartet, quarteta o quàdruple
|
432 |
+
5 quintern, quintet, cinquet o quíntuple
|
433 |
+
6 sextet, siset o sèxtuple
|
434 |
+
7 septet, setet o sèptuple
|
435 |
+
8 octet o òctuple
|
436 |
+
9 nònuple
|
437 |
+
10 dècada o dècuple
|
438 |
+
12 dotzena
|
439 |
+
100 centenar
|
440 |
+
144 grossa
|
441 |
+
1000 miler
|
442 |
+
10000 miríada
|
443 |
+
|
444 |
+
== years ==
|
445 |
+
2 bienni
|
446 |
+
3 trienni
|
447 |
+
4 quadrienni
|
448 |
+
5 quinquenni o lustre
|
449 |
+
6 sexenni
|
450 |
+
7 septenni
|
451 |
+
10 dècada o decenni
|
452 |
+
12 duodecenni
|
453 |
+
15 quindecenni
|
454 |
+
20 vintenni o vicenni
|
455 |
+
30 trentenni o tricenni
|
456 |
+
40 quarantenni
|
457 |
+
50 cinquantenni
|
458 |
+
60 seixantenni
|
459 |
+
70 setantenni
|
460 |
+
80 huitantenni # [:ca-valencia:] [:ca-ES-valencia:]
|
461 |
+
80 vuitantenni
|
462 |
+
90 norantenni
|
463 |
+
100 segle o centenni
|
464 |
+
1000 mil·lenni
|
465 |
+
|
466 |
+
== multiplicative ==
|
467 |
+
2 doble o duple
|
468 |
+
3 triple
|
469 |
+
4 quàdruple
|
470 |
+
5 quíntuple
|
471 |
+
6 sèxtuple
|
472 |
+
7 sèptuple
|
473 |
+
8 òctuple
|
474 |
+
9 nònuple
|
475 |
+
10 dècuple
|
476 |
+
12 duodècuple
|
477 |
+
100 cèntuple
|
478 |
+
1/10 subdècuple
|
479 |
+
1/2 súbduple
|
480 |
+
|
481 |
+
== help ==
|
482 |
+
|
483 |
+
"" $(1)|, $(2), $(3)\n$(help feminine)$(help masculine)$(help ordinal-number-masculine)$(help ordinal-number-feminine)$(help ordinal-feminine)$(help ordinal-masculine)
|
484 |
+
(feminine|masculine|ordinal(-number)?(-feminine|-masculine)?) \1: $(\1 1), $(\1 2), $(\1 3)\n
|
485 |
+
|
text/cleaners.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
|
3 |
+
'''
|
4 |
+
Cleaners are transformations that run over the input text at both training and eval time.
|
5 |
+
|
6 |
+
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
7 |
+
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
|
8 |
+
1. "english_cleaners" for English text
|
9 |
+
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
10 |
+
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
11 |
+
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
12 |
+
the symbols in symbols.py to match your data).
|
13 |
+
'''
|
14 |
+
|
15 |
+
import re
|
16 |
+
from unidecode import unidecode
|
17 |
+
from text.numbers import normalize_numbers
|
18 |
+
from text.numbers_ca import normalize_numbers_ca
|
19 |
+
from text.symbols import symbols
|
20 |
+
|
21 |
+
# Regular expression matching whitespace:
|
22 |
+
_whitespace_re = re.compile(r'\s+')
|
23 |
+
|
24 |
+
# List of (regular expression, replacement) pairs for abbreviations:
|
25 |
+
_abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
|
26 |
+
('mrs', 'misess'),
|
27 |
+
('mr', 'mister'),
|
28 |
+
('dr', 'doctor'),
|
29 |
+
('st', 'saint'),
|
30 |
+
('co', 'company'),
|
31 |
+
('jr', 'junior'),
|
32 |
+
('maj', 'major'),
|
33 |
+
('gen', 'general'),
|
34 |
+
('drs', 'doctors'),
|
35 |
+
('rev', 'reverend'),
|
36 |
+
('lt', 'lieutenant'),
|
37 |
+
('hon', 'honorable'),
|
38 |
+
('sgt', 'sergeant'),
|
39 |
+
('capt', 'captain'),
|
40 |
+
('esq', 'esquire'),
|
41 |
+
('ltd', 'limited'),
|
42 |
+
('col', 'colonel'),
|
43 |
+
('ft', 'fort'),
|
44 |
+
]]
|
45 |
+
|
46 |
+
# List of (regular expression, replacement) pairs for catalan abbreviations:
|
47 |
+
_abbreviations_ca = [(re.compile('\\b%s\\b' % x[0], re.IGNORECASE), x[1]) for x in [
|
48 |
+
('tv3', 't v tres'),
|
49 |
+
('8tv', 'vuit t v'),
|
50 |
+
('pp', 'p p'),
|
51 |
+
('psoe', 'p soe'),
|
52 |
+
('sr.?', 'senyor'),
|
53 |
+
('sra.?', 'senyora'),
|
54 |
+
('srta.?', 'senyoreta')
|
55 |
+
]]
|
56 |
+
|
57 |
+
_replacements_ca = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
|
58 |
+
(';', ','),
|
59 |
+
(':', '\.'),
|
60 |
+
('\.\.\.,', ','),
|
61 |
+
('\.\.\.', '…'),
|
62 |
+
('ñ','ny')
|
63 |
+
]]
|
64 |
+
|
65 |
+
|
66 |
+
def expand_abbreviations(text, lang='ca'):
    '''Replace the abbreviations known for `lang` in `text` with their spoken forms.

    `lang` must be 'en' or 'ca'; any other value raises ValueError.
    '''
    if lang not in ('en', 'ca'):
        raise ValueError('no %s language for abbreviations'%lang)
    table = _abbreviations_en if lang == 'en' else _abbreviations_ca
    for pattern, spoken in table:
        text = re.sub(pattern, spoken, text)
    return text
|
76 |
+
|
77 |
+
|
78 |
+
def convert_characters(text, lang='ca'):
    '''Apply the character/punctuation replacements defined for `lang` to `text`.

    Only Catalan ('ca') has a replacement table; any other `lang` raises
    ValueError.
    '''
    if lang == 'ca':
        _replacements = _replacements_ca
    else:
        raise ValueError('no %s language for punctuation conversion'%lang)
    # Iterate over the table chosen above instead of naming the Catalan table a
    # second time, so the branch above is the only place that selects the rules.
    for regex, replacement in _replacements:
        text = re.sub(regex, replacement, text)
    return text
|
86 |
+
|
87 |
+
|
88 |
+
def expand_numbers(text, lang="ca"):
    '''Spell out the numbers in `text`: Catalan rules for 'ca', the English rules for any other `lang`.'''
    normalizer = normalize_numbers_ca if lang == 'ca' else normalize_numbers
    return normalizer(text)
|
93 |
+
|
94 |
+
|
95 |
+
def lowercase(text):
    '''Return `text` converted to lower case.'''
    lowered = text.lower()
    return lowered
|
97 |
+
|
98 |
+
|
99 |
+
def collapse_whitespace(text):
    '''Replace every run of whitespace in `text` with a single space.'''
    return _whitespace_re.sub(' ', text)
|
101 |
+
|
102 |
+
|
103 |
+
def convert_to_ascii(text, lang="ca"):
    '''Transliterate `text` with Unidecode.

    For 'en' the whole string is passed through `unidecode`. For 'ca' only the
    characters that are not in `symbols` are transliterated; characters listed
    in `symbols` are left as they are.

    Raises ValueError for any other `lang`.
    '''
    if lang == 'en':
        return unidecode(text)
    elif lang == 'ca':
        # A string iterates over its characters, so set(text) already yields
        # the distinct characters; no intermediate list is needed.
        for char in set(text):
            if char not in symbols:
                text = text.replace(char, unidecode(char))
        return text
    else:
        # The message previously said "punctuation conversion", copied from
        # convert_characters(); name the operation that actually failed.
        raise ValueError('no %s language for ascii conversion'%lang)
|
116 |
+
|
117 |
+
|
118 |
+
def basic_cleaners(text):
    '''Basic pipeline: lowercase the text, then collapse whitespace, with no transliteration.'''
    return collapse_whitespace(lowercase(text))
|
123 |
+
|
124 |
+
|
125 |
+
def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    # Request the full transliteration explicitly: convert_to_ascii() defaults
    # to lang="ca", which keeps every character present in `symbols` and so
    # does not produce ASCII-only output.
    text = convert_to_ascii(text, lang='en')
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
|
131 |
+
|
132 |
+
|
133 |
+
def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    # Pass the language explicitly: convert_to_ascii() defaults to lang="ca",
    # which would run the Catalan transliteration mode on English text.
    text = convert_to_ascii(text, lang='en')
    text = lowercase(text)
    text = expand_numbers(text, lang='en')
    text = expand_abbreviations(text, lang='en')
    text = collapse_whitespace(text)
    return text
|
141 |
+
|
142 |
+
|
143 |
+
def catalan_cleaners(text):
    '''Pipeline for Catalan text: abbreviation and number expansion plus character normalisation.'''
    text = lowercase(text)
    # Abbreviations are expanded before numbers: the Catalan table has entries
    # that contain digits ("tv3", "8tv"), and expand_numbers() would spell those
    # digits out first, so the entries could never match afterwards.
    text = expand_abbreviations(text, lang="ca")
    text = expand_numbers(text, lang="ca")
    text = convert_characters(text, lang="ca")
    text = convert_to_ascii(text, lang="ca")
    text = collapse_whitespace(text)
    return text
|
text/cmudict.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
|
3 |
+
import re
|
4 |
+
|
5 |
+
|
6 |
+
# Symbols accepted in a pronunciation string; _get_pronunciation() rejects any
# pronunciation containing a token that is not listed here. Several symbols
# appear both bare and with a trailing digit 0-2.
# NOTE(review): the digit is presumably the CMUdict stress marker; confirm.
valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
]

# Set form of valid_symbols, used for membership tests.
_valid_symbol_set = set(valid_symbols)
|
17 |
+
|
18 |
+
|
19 |
+
class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        '''Load the dictionary from a path (opened as latin-1) or from an iterable of lines.

        When `keep_ambiguous` is false, only words with exactly one
        pronunciation are kept.
        '''
        if not isinstance(file_or_path, str):
            parsed = _parse_cmudict(file_or_path)
        else:
            with open(file_or_path, encoding='latin-1') as handle:
                parsed = _parse_cmudict(handle)
        if not keep_ambiguous:
            parsed = {
                word: variants
                for word, variants in parsed.items()
                if len(variants) == 1
            }
        self._entries = parsed

    def __len__(self):
        '''Return the number of words stored in the dictionary.'''
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.'''
        key = word.upper()
        return self._entries.get(key)
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
# Matches a parenthesised number such as "(1)"; it is stripped from the word so
# that numbered variants of a word share a single dictionary key.
_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    '''Parse dictionary lines into a dict mapping each word to a list of pronunciations.

    Only lines whose first character is an upper-case ASCII letter or an
    apostrophe are treated as entries. Entries whose pronunciation is rejected
    by `_get_pronunciation`, and entries with no pronunciation field at all,
    are skipped.
    '''
    cmudict = {}
    for line in file:
        if not line or not ('A' <= line[0] <= 'Z' or line[0] == "'"):
            continue
        # Separate the word from the rest of the line on the first run of
        # whitespace, however wide it is. Splitting on one literal space handed
        # only a single space-delimited field to _get_pronunciation() and raised
        # IndexError on a line that contained no space.
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue
        word = re.sub(_alt_re, '', parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
|
58 |
+
|
59 |
+
|
60 |
+
def _get_pronunciation(s):
    '''Return `s` stripped of surrounding whitespace, or None if any space-separated token is not a valid symbol.'''
    tokens = s.strip().split(' ')
    if any(token not in _valid_symbol_set for token in tokens):
        return None
    return ' '.join(tokens)
|
text/numbers.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
|
3 |
+
import inflect
|
4 |
+
import re
|
5 |
+
|
6 |
+
|
7 |
+
# Shared inflect engine used by the helpers below to spell numbers as words.
_inflect = inflect.engine()
# A digit, then one or more digits/commas, then a digit (e.g. "1,000").
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits, a period, digits (e.g. "3.14").
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# A pound sign followed by an amount; the amount is captured without the sign.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# A dollar sign followed by an amount; the amount is captured without the sign.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Digits followed by an English ordinal suffix.
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of digits.
_number_re = re.compile(r'[0-9]+')
|
14 |
+
|
15 |
+
|
16 |
+
def _remove_commas(m):
|
17 |
+
return m.group(1).replace(',', '')
|
18 |
+
|
19 |
+
|
20 |
+
def _expand_decimal_point(m):
|
21 |
+
return m.group(1).replace('.', ' point ')
|
22 |
+
|
23 |
+
|
24 |
+
def _expand_dollars(m):
|
25 |
+
match = m.group(1)
|
26 |
+
parts = match.split('.')
|
27 |
+
if len(parts) > 2:
|
28 |
+
return match + ' dollars' # Unexpected format
|
29 |
+
dollars = int(parts[0]) if parts[0] else 0
|
30 |
+
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
31 |
+
if dollars and cents:
|
32 |
+
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
33 |
+
cent_unit = 'cent' if cents == 1 else 'cents'
|
34 |
+
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
|
35 |
+
elif dollars:
|
36 |
+
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
37 |
+
return '%s %s' % (dollars, dollar_unit)
|
38 |
+
elif cents:
|
39 |
+
cent_unit = 'cent' if cents == 1 else 'cents'
|
40 |
+
return '%s %s' % (cents, cent_unit)
|
41 |
+
else:
|
42 |
+
return 'zero dollars'
|
43 |
+
|
44 |
+
|
45 |
+
def _expand_ordinal(m):
    '''Spell out the whole text matched by `m` using the shared inflect engine.'''
    matched = m.group(0)
    return _inflect.number_to_words(matched)
|
47 |
+
|
48 |
+
|
49 |
+
def _expand_number(m):
|
50 |
+
num = int(m.group(0))
|
51 |
+
if num > 1000 and num < 3000:
|
52 |
+
if num == 2000:
|
53 |
+
return 'two thousand'
|
54 |
+
elif num > 2000 and num < 2010:
|
55 |
+
return 'two thousand ' + _inflect.number_to_words(num % 100)
|
56 |
+
elif num % 100 == 0:
|
57 |
+
return _inflect.number_to_words(num // 100) + ' hundred'
|
58 |
+
else:
|
59 |
+
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
|
60 |
+
else:
|
61 |
+
return _inflect.number_to_words(num, andword='')
|
62 |
+
|
63 |
+
|
64 |
+
def normalize_numbers(text):
    '''Rewrite the numeric expressions in `text` as English words.

    The substitutions run in a fixed order: comma removal, pounds, dollars,
    decimals, ordinals, and finally any remaining digit runs.
    '''
    rewrites = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text
|
text/numbers_ca.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import io
|
3 |
+
import pathlib
|
4 |
+
from text.soros import compile
|
5 |
+
|
6 |
+
# Absolute path of the directory that contains this module.
filepath = pathlib.Path(__file__).parent.absolute()
# Read ca.sor from that directory and compile it once, at import time, for the
# 'ca' language; the helpers below call num2text.run(...) for each conversion.
with io.open(f"{filepath}/ca.sor", 'r', encoding="utf-8") as prg:
    num2text = compile(prg.read(), 'ca')

# A digit, then digits/periods, then exactly three digits (e.g. "1.000").
_separador_milers_re = re.compile(r'([0-9][0-9\.]+[0-9]{3})')
# Digits, a comma, digits (e.g. "1,33").
_decimal_re = re.compile(r'([0-9]+\,[0-9]+)')
# Digits followed by one or more masculine singular ordinal suffixes.
_ordinal_ms_re = re.compile(r'([0-9]+)(r|er|n|on|t|rt|è|e|ne|nè)+(\b)')
# Digits followed by one or more masculine plural ordinal suffixes.
# NOTE(review): 'ns' is listed twice in this alternation; the second is redundant.
_ordinal_mp_re = re.compile(r'([0-9]+)(rs|ns|ts|ns)+(\b)')
# Digits followed by one or more feminine singular ordinal suffixes.
_ordinal_fs_re = re.compile(r'([0-9]+)(a|ra|na|ta)+(\b)')
# Digits followed by the feminine plural ordinal suffix.
_ordinal_fp_re = re.compile(r'([0-9]+)(es)+(\b)')
# Any run of digits.
_cardinal_re = re.compile(r'[0-9]+')
# Two digit runs separated by "/" (e.g. "1/2").
_fraccions_re = re.compile(r'(\b)([0-9]+\/[0-9]+)(\b)')
# One or two digits, ":", two digits (e.g. "11:45").
_hores_re = re.compile(r'(\b)([0-9]{1,2}):([0-9]{2})(\b)')
|
19 |
+
|
20 |
+
def _esborra_separador_milers(m):
|
21 |
+
return m.group(1).replace('.', '')
|
22 |
+
|
23 |
+
def _num2text(m):
    """Run the whole text matched by `m` through the compiled Soros program."""
    matched = m.group(0)
    return num2text.run(matched)
|
25 |
+
|
26 |
+
def _ordinal_ms(m):
    """Convert the digits in group 1 of `m` with the "ordinal" function, then append group 3."""
    digits, tail = m.group(1, 3)
    spoken = num2text.run(f"ordinal {digits}")
    return spoken + tail
|
28 |
+
|
29 |
+
def _ordinal_mp(m):
    """Convert the digits in group 1 of `m` with "ordinal-masculine-plural", then append group 3."""
    digits, tail = m.group(1, 3)
    spoken = num2text.run(f"ordinal-masculine-plural {digits}")
    return spoken + tail
|
31 |
+
|
32 |
+
def _ordinal_fs(m):
    """Convert the digits in group 1 of `m` with "ordinal-feminine", then append group 3."""
    digits, tail = m.group(1, 3)
    spoken = num2text.run(f"ordinal-feminine {digits}")
    return spoken + tail
|
34 |
+
|
35 |
+
def _ordinal_fp(m):
    """Convert the digits in group 1 of `m` with "ordinal-feminine-plural", then append group 3."""
    digits, tail = m.group(1, 3)
    spoken = num2text.run(f"ordinal-feminine-plural {digits}")
    return spoken + tail
|
37 |
+
|
38 |
+
def _fraccions(m):
    """Convert the fraction in group 2 of `m` with "fraction", keeping groups 1 and 3 around it."""
    lead, fraction, tail = m.group(1, 2, 3)
    spoken = num2text.run(f"fraction {fraction}")
    return lead + spoken + tail
|
40 |
+
|
41 |
+
def _hores(m):
    """Convert groups 2 and 3 of `m` separately and join them with " i ", keeping groups 1 and 4 around them."""
    lead, hours, minutes, tail = m.group(1, 2, 3, 4)
    spoken_hours = num2text.run(hours)
    spoken_minutes = num2text.run(minutes)
    return lead + spoken_hours + " i " + spoken_minutes + tail
|
43 |
+
|
44 |
+
def normalize_numbers_ca(text):
    """Rewrite the numeric expressions in `text` as Catalan words.

    The substitutions run in a fixed order: thousands separators, decimals,
    the four ordinal forms, fractions, times, and finally any remaining
    digit runs.
    """
    rewrites = (
        (_separador_milers_re, _esborra_separador_milers),
        (_decimal_re, _num2text),
        (_ordinal_ms_re, _ordinal_ms),
        (_ordinal_mp_re, _ordinal_mp),
        (_ordinal_fs_re, _ordinal_fs),
        (_ordinal_fp_re, _ordinal_fp),
        (_fraccions_re, _fraccions),
        (_hores_re, _hores),
        (_cardinal_re, _num2text),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text
|
text/numbers_ca_test.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
|
3 |
+
from text.numbers_ca import normalize_numbers_ca
|
4 |
+
|
5 |
+
|
6 |
+
class NumbersCa(unittest.TestCase):
    # Each test feeds literal Catalan strings to normalize_numbers_ca() and
    # compares the result with the expected spelled-out text.

    def test_cardinals(self):
        """
        Converts simple cardinals inside a sentence
        """
        self.assertEqual(normalize_numbers_ca("Va nèixer el 23 de desembre de 1988"), "Va nèixer el vint-i-tres de desembre de mil nou-cents vuitanta-vuit")
        self.assertEqual(normalize_numbers_ca("tinc 3 preguntes"), "tinc tres preguntes")

    def test_separador_milers(self):
        """
        Ignores thousands separators
        """
        self.assertEqual(normalize_numbers_ca("1.000"), "mil")
        self.assertEqual(normalize_numbers_ca("323.400"), "tres-cents vint-i-tres mil quatre-cents")
        self.assertEqual(normalize_numbers_ca("900.323.400"), "nou-cents milions tres-cents vint-i-tres mil quatre-cents")

    def test_decimals(self):
        """
        Converts decimals
        """
        self.assertEqual(normalize_numbers_ca("1,33"), "u coma trenta-tres")
        self.assertEqual(normalize_numbers_ca("75,5"), "setanta-cinc coma cinc")
        self.assertEqual(normalize_numbers_ca("75,555"), "setanta-cinc coma cinc-cents cinquanta-cinc")
        self.assertEqual(normalize_numbers_ca("999.999.999,99"), "nou-cents noranta-nou milions nou-cents noranta-nou mil nou-cents noranta-nou coma noranta-nou")
        self.assertEqual(normalize_numbers_ca("1,12345678900"), "u coma dotze trenta-quatre cinquanta-sis set-cents vuitanta-nou")

    def test_decimals_2(self):
        """
        Ignores commas that do not belong to a decimal number
        """
        self.assertEqual(normalize_numbers_ca("Va comprar pa, vi i llonganisses"), "Va comprar pa, vi i llonganisses")
        self.assertEqual(normalize_numbers_ca("El número guanyador és 1, 23, 55, 34"), "El número guanyador és u, vint-i-tres, cinquanta-cinc, trenta-quatre")

    def test_ordinals_ms(self):
        """
        Converts masculine singular ordinals
        """
        self.assertEqual(normalize_numbers_ca("Va arribar 4t de 5"), "Va arribar quart de cinc")
        self.assertEqual(normalize_numbers_ca("el 1r va ser ell"), "el primer va ser ell")
        self.assertEqual(normalize_numbers_ca("el 3er, no va aguantar"), "el tercer, no va aguantar")
        self.assertEqual(normalize_numbers_ca("2n"), "segon")
        self.assertEqual(normalize_numbers_ca("2on"), "segon")
        self.assertEqual(normalize_numbers_ca("4t"), "quart")
        self.assertEqual(normalize_numbers_ca("4rt"), "quart")
        self.assertEqual(normalize_numbers_ca("5è: remogueu la barreja"), "cinquè: remogueu la barreja")
        self.assertEqual(normalize_numbers_ca("6e"), "sisè")
        # NOTE(review): exact duplicate of the previous assertion; it may have
        # been meant to cover a different suffix. Confirm and replace or remove.
        self.assertEqual(normalize_numbers_ca("6e"), "sisè")
        self.assertEqual(normalize_numbers_ca("21nè"), "vint-i-unè")
        self.assertEqual(normalize_numbers_ca("un 81ne de Palamós"), "un vuitanta-unè de Palamós")

    def test_ordinals_fs(self):
        """
        Converts feminine singular ordinals
        """
        self.assertEqual(normalize_numbers_ca("1a"), "primera")
        self.assertEqual(normalize_numbers_ca("3ra"), "tercera")
        self.assertEqual(normalize_numbers_ca("2a"), "segona")
        self.assertEqual(normalize_numbers_ca("2na"), "segona")
        self.assertEqual(normalize_numbers_ca("4a."), "quarta.")
        self.assertEqual(normalize_numbers_ca("pugi a la 4ta, després giri a l'esquerra"), "pugi a la quarta, després giri a l'esquerra")
        self.assertEqual(normalize_numbers_ca("va quedar 5a en la classificació"), "va quedar cinquena en la classificació")
        self.assertEqual(normalize_numbers_ca("la 5na vegada"), "la cinquena vegada")

    def test_ordinals_mp(self):
        """
        Converts masculine plural ordinals
        """
        self.assertEqual(normalize_numbers_ca("1rs"), "primers")
        self.assertEqual(normalize_numbers_ca("van arribar 2ns"), "van arribar segons")

    def test_ordinals_fp(self):
        """
        Converts feminine plural ordinals
        """
        self.assertEqual(normalize_numbers_ca("1es"), "primeres")

    def test_fraccions_s(self):
        """
        Converts singular fractions
        """
        self.assertEqual(normalize_numbers_ca("1/2 got de vi"), "mig got de vi")
        self.assertEqual(normalize_numbers_ca("1/3 de farina"), "un terç de farina")
        self.assertEqual(normalize_numbers_ca("1/8"), "un vuitè")

    def test_fraccions_p(self):
        """
        Converts plural fractions
        """
        self.assertEqual(normalize_numbers_ca("4/2 gots de vi"), "quatre migs gots de vi")
        self.assertEqual(normalize_numbers_ca("2/3 de farina"), "dos terços de farina")
        self.assertEqual(normalize_numbers_ca("3/8"), "tres vuitens")

    def test_hores(self):
        """
        Converts times of day in a simplified way
        """
        self.assertEqual(normalize_numbers_ca("a les 11:45"), "a les onze i quaranta-cinc")
        self.assertEqual(normalize_numbers_ca("a partir de les 23:12"), "a partir de les vint-i-tres i dotze")
|
104 |
+
|
105 |
+
if __name__ == '__main__':
|
106 |
+
unittest.main()
|
text/soros.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"Soros interpreter (see http://numbertext.org)"
|
2 |
+
from __future__ import unicode_literals
|
3 |
+
from __future__ import print_function
|
4 |
+
import re
|
5 |
+
import sys
|
6 |
+
|
7 |
+
|
8 |
+
def run(program, data, lang):
    "Compile `program` for `lang` and run the result once on `data`."
    interpreter = compile(program, lang)
    return interpreter.run(data)
|
10 |
+
|
11 |
+
|
12 |
+
def compile(program, lang):
    "Return a Soros interpreter object built from `program` for `lang`."
    interpreter = _Soros(program, lang)
    return interpreter
|
14 |
+
|
15 |
+
# conversion function
|
16 |
+
|
17 |
+
|
18 |
+
def _tr(text, chars, chars2, delim):
|
19 |
+
for i in range(0, len(chars)):
|
20 |
+
text = text.replace(delim + chars[i], chars2[i])
|
21 |
+
return text
|
22 |
+
|
23 |
+
|
24 |
+
# string literals for metacharacter encoding
# _m holds the ten metacharacters, in order: \ " ; # $ ( ) | [ ]
_m = "\\\";#$()|[]"
# Unicode private area
# _c holds ten private-use code points, one per position of _m.
_c = u"\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007\uE008\uE009"
# Same code point as _c[3].
_pipe = u"\uE003"
# separator prefix = \uE00A

# pattern to recognize function calls in the replacement string
# The verbose pattern is written with backslash-escaped $ ( ) |; _tr() swaps
# those four escapes for _c[0..3] before the pattern is compiled.
_func = re.compile(_tr(r"""(?:\|?(?:\$\()+)? # optional nested calls
(\|?\$\(([^\(\)]*)\)\|?) # inner call (2 subgroups)
(?:\)+\|?)?""", # optional nested calls
    _m[4:8], _c[:4], "\\"), re.X) # \$, \(, \), \| -> \uE000..\uE003
|
36 |
+
|
37 |
+
|
38 |
+
class _Soros:
    """Interpreter for one parsed Soros program (see http://numbertext.org).

    ``self.lines`` holds the rules in source order. Each rule is a list
    ``[regex, template, begin_only, end_only]``:

    * ``regex`` -- compiled pattern, anchored at both ends;
    * ``template`` -- ``re`` expansion template in which function calls are
      written with the private-use code points of ``_c``;
    * ``begin_only`` / ``end_only`` -- whether the source pattern started
      with "^" / ended with "$".
    """

    def __init__(self, prg, lang):
        """Parse the Soros source ``prg`` into rules, enabling ``lang`` lines.

        A malformed rule pattern makes ``re.compile`` raise; the offending
        pattern is printed to stderr and the exception is re-raised.
        """
        self.lines = []
        if "__numbertext__" not in prg:
            prg = "__numbertext__;" + prg
        # default left zero deletion
        # and separator function (no separation, if subcall returns with empty string)
        prg = prg.replace("__numbertext__", u"""0+(0|[1-9]\\d*) $1
\"([a-z][-a-z]* )0+(0|[1-9]\\d*)\" $(\\1\\2)
\"\uE00A(.*)\uE00A(.+)\uE00A(.*)\" \\1\\2\\3
\"\uE00A.*\uE00A\uE00A.*\"
""")
        # \\, \", \;, \# -> \uE000..\uE003
        prg = _tr(prg, _m[:4], _c[:4], "\\")
        # switch off all country-dependent lines, and switch on the requested ones
        prg = re.sub(
            r"(^|[\n;])([^\n;#]*#[^\n]*[\[]:[^\n:\]]*:][^\n]*)", r"\1#\2", prg)
        prg = re.sub(r"(^|[\n;])#([^\n;#]*#[^\n]*[\[]:" +
                     lang.replace("_", "-") + r":][^\n]*)", r"\1\2", prg)
        # Splits a rule into its (optionally quoted) pattern and replacement.
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        matchline = re.compile(r'^\s*("[^"]*"|[^\s]*)\s*(.*[^\s])?\s*$')
        prefix = ""
        # Drop comments, then treat newlines and ";" alike as rule separators.
        for line in re.sub("(#[^\n]*)?(\n|$)", ";", prg).split(";"):
            macro = re.match("== *(.*[^ ]?) ==", line)
            if macro is not None:
                # "== name ==" sets the prefix prepended to the rules below.
                prefix = macro.group(1)
                continue
            m = matchline.match(line)
            if prefix != "" and line != "" and m is not None:
                # Rebuild the rule as: "[^]prefix pattern" replacement
                pattern = m.group(1).strip("\"")
                space = " " if pattern != "" else ""
                caret = ""
                if pattern[0:1] == "^":
                    pattern = pattern[1:]
                    caret = "^"
                repl = m.group(2) if m.group(2) is not None else ""
                line = "\"" + caret + prefix + space + pattern + "\" " + repl
                m = matchline.match(line)
            if m is None:
                continue
            pattern = _tr(m.group(1).strip("\""), _c[1:4], _m[1:4], "") \
                .replace(_c[_m.find("\\")], "\\\\")  # -> \\, ", ;, #
            if m.group(2) is not None:
                repl = m.group(2).strip("\"")
            else:
                repl = ""
            # \$, \(, \), \|, \[, \] -> \uE004..\uE009
            repl = _tr(repl, _m[4:], _c[4:], "\\")
            # call inner separator: [ ... $1 ... ] -> $(\uE00A ... \uE00A$1\uE00A ... )
            repl = re.sub(r"[\[]\$(\d\d?|\([^\)]+\))",
                          u"$(\uE00A\uE00A|$\\1\uE00A", repl)
            repl = re.sub(r"[\[]([^\$[\\]*)\$(\d\d?|\([^\)]+\))",
                          u"$(\uE00A\\1\uE00A$\\2\uE00A", repl)
            # add "|" in terminating position
            repl = re.sub(r"\uE00A]$", "|\uE00A)", repl)
            repl = re.sub(r"]", ")", repl)
            # $()|$() -> $()||$()
            repl = re.sub(r"(\$\d|\))\|\$", r"\1||$", repl)
            # \uE000..\uE003-> \, ", ;, #
            repl = _tr(repl, _c[:4], _m[:4], "")
            # $, (, ), | -> \uE000..\uE003
            repl = _tr(repl, _m[4:8], _c[:4], "")
            # \uE004..\uE009 -> $, (, ), |, [, ]
            repl = _tr(repl, _c[4:], _m[4:], "")
            # Encoded "$n" becomes an encoded call around \g<n>; a plain
            # backreference \n becomes \g<n>.
            repl = re.sub(r"\\(\d)", r"\\g<\1>",
                          re.sub(r"\uE000(\d)", "\uE000\uE001\\\\g<\\1>\uE002", repl))
            try:
                regex = re.compile(
                    "^" + pattern.lstrip("^").rstrip("$") + "$")
            except Exception:
                print("Error in following regex line: " + pattern,
                      file=sys.stderr)
                raise
            self.lines.append(
                [regex, repl, pattern[:1] == "^", pattern[-1:] == "$"])

    def run(self, data):
        """Convert ``data``, treating it as both the begin and end of input."""
        return self._run(data, True, True)

    def _run(self, data, begin, end):
        """Apply the first rule matching ``data`` and evaluate nested calls.

        ``begin`` / ``end`` say whether ``data`` sits at the start / end of
        the overall text; rules flagged begin-only or end-only are skipped
        when the corresponding flag is false. Returns the rewritten string,
        or "" when no rule matches.
        """
        for regex, template, begin_only, end_only in self.lines:
            if (not begin and begin_only) or (not end and end_only):
                continue
            m = regex.match(data)
            if not m:
                continue
            try:
                s = m.expand(template)
            except Exception:
                print("Error for the following input: " + data,
                      file=sys.stderr)
                raise
            # Repeatedly resolve the innermost encoded function call.
            n = _func.search(s)
            while n:
                b = False
                e = False
                # An encoded "|" next to the call forces the begin/end flag;
                # otherwise the flag is inherited only at the string edge.
                if n.group(1)[0:1] == _pipe or n.group()[0:1] == _pipe:
                    b = True
                elif n.start() == 0:
                    b = begin
                if n.group(1)[-1:] == _pipe or n.group()[-1:] == _pipe:
                    e = True
                elif n.end() == len(s):
                    e = end
                s = s[:n.start(1)] + self._run(n.group(2), b, e) + s[n.end(1):]
                n = _func.search(s)
            return s
        return ""
|
text/symbols.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """

# Defines the set of symbols used in text input to the model.
#
# The upstream default was a set of ASCII characters suited to English or to
# text run through Unidecode. Here _letters also lists accented vowels and
# "ç"; for other data, modify _letters. The upstream note points to
# TRAINING_DATA.md for details.
from text import cmudict

_pad = '_'  # in principle not used in tacotron2
_punctuation = '\'!,.?…· '
_letters = 'AÀÁBCÇDEÉÈFGHIÍÏJKLMNOÓÒPQRSTUÜÚVWXYZaàábcçdeéèfghiíïjklmnoóòpqrstuüúvwxyz'

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + arpabet_symbol for arpabet_symbol in cmudict.valid_symbols]

# Export all symbols: pad first, then punctuation, letters and ARPAbet.
symbols = [_pad, *_punctuation, *_letters, *_arpabet]
|
text/symbols_en.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """

# Defines the set of symbols used in text input to the model.
#
# The default is a set of ASCII characters that works well for English or
# for text that has been run through Unidecode. For other data, modify
# _letters. The upstream note points to TRAINING_DATA.md for details.
from text import cmudict

_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + arpabet_symbol for arpabet_symbol in cmudict.valid_symbols]

# Export all symbols: pad, special, punctuation, letters, then ARPAbet.
symbols = [_pad, *_special, *_punctuation, *_letters, *_arpabet]
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "PereLluis13/wav2vec2-xls-r-300m-ca", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34861f6ec08a47ca474aba28cfc694e9303c20ab6b67ba041b792072f2a8e759
|
3 |
+
size 3055
|
vocab.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"#": 1, "'": 2, "-": 3, "a": 4, "b": 5, "c": 6, "d": 7, "e": 8, "f": 9, "g": 10, "h": 11, "i": 12, "j": 13, "k": 14, "l": 15, "m": 16, "n": 17, "o": 18, "p": 19, "q": 20, "r": 21, "s": 22, "t": 23, "u": 24, "v": 25, "w": 26, "x": 27, "y": 28, "z": 29, "·": 30, "à": 31, "ç": 32, "è": 33, "é": 34, "í": 35, "ï": 36, "ò": 37, "ó": 38, "ú": 39, "ü": 40, "ः": 41, "|": 0, "[UNK]": 42, "[PAD]": 43}
|