esnya committed on
Commit
87cf786
1 Parent(s): 8b0c89d

speecht5_tts_jvs_ver1_e20_openjtalk_longer_20230809-031157_tokenizer

Files changed (1)
  1. speecht5_openjtalk_tokenizer.py +96 -0
speecht5_openjtalk_tokenizer.py ADDED
@@ -0,0 +1,96 @@
+ import json
+ import re
+ from itertools import chain
+ from pathlib import Path
+ from typing import List, Optional
+
+ from transformers import SpeechT5Tokenizer
+ from transformers.models.speecht5.tokenization_speecht5 import (
+     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
+ )
+
+
+ def _g2p_with_np(text: str, np_list: str) -> List[str]:
+     # Imported lazily so the module can be loaded without pyopenjtalk installed.
+     from pyopenjtalk import g2p
+
+     # Split on non-phoneme characters, keeping each one as its own segment.
+     np_pattern = re.compile(f"([{re.escape(np_list)}])")
+
+     return list(
+         chain.from_iterable(
+             # Pass non-phoneme characters through unchanged; run g2p on the rest.
+             (segment,) if segment in np_list else g2p(segment, kana=False, join=False)
+             for segment in np_pattern.split(text)
+             if len(segment) > 0
+         )
+     )
+
+
+ NP_CHARACTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"
+
+
+ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
+     vocab_files_names = {"vocab_file": "spm_char.model"}
+     pretrained_vocab_files_map = {}
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+     label2id = {}
+
+     def __init__(
+         self,
+         vocab_file,
+         bos_token: str = "<s>",
+         eos_token: str = "</s>",
+         unk_token: str = "<unk>",
+         pad_token: str = "<pad>",
+         non_phoneme_characters: str = NP_CHARACTERS,
+         **kwargs,
+     ):
+         try:
+             super().__init__(
+                 vocab_file=None,
+                 bos_token=bos_token,
+                 eos_token=eos_token,
+                 unk_token=unk_token,
+                 pad_token=pad_token,
+                 **kwargs,
+             )
+         except TypeError:
+             # The parent tokenizer tries to load a SentencePiece model; with
+             # vocab_file=None that fails, and the JSON vocabulary below is
+             # used instead.
+             pass
+
+         self.non_phoneme_characters = non_phoneme_characters
+
+         self.label2id = {}
+         self.id2label = {}
+         if isinstance(vocab_file, str) and vocab_file.endswith(".json"):
+             with open(vocab_file, encoding="utf-8") as f:
+                 self.label2id = json.load(f)
+             self.id2label = {v: k for k, v in self.label2id.items()}
+
+     @property
+     def bos_token_id(self) -> Optional[int]:
+         return super().bos_token_id
+
+     @property
+     def vocab_size(self) -> int:
+         return len(self.label2id)
+
+     def get_vocab(self):
+         return self.label2id
+
+     def save_vocabulary(
+         self, save_directory: str, filename_prefix: Optional[str] = None
+     ):
+         # Standard naming: "<prefix>-vocab.json", or "vocab.json" with no prefix.
+         prefix = f"{filename_prefix}-" if filename_prefix else ""
+         vocab_path = Path(save_directory) / f"{prefix}vocab.json"
+         vocab_path.parent.mkdir(parents=True, exist_ok=True)
+         with open(vocab_path, "w", encoding="utf-8") as f:
+             json.dump(self.label2id, f, ensure_ascii=False, indent=2)
+         return (str(vocab_path),)
+
+     def _tokenize(self, text: str) -> List[str]:
+         return _g2p_with_np(text, self.non_phoneme_characters)
+
+     def _convert_token_to_id(self, token):
+         return self.label2id.get(token, self.label2id.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         return self.id2label.get(index, self.unk_token)
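For reference, a minimal usage sketch (not part of the commit): it assumes pyopenjtalk is installed, that the file above is importable as speecht5_openjtalk_tokenizer, and that a vocab.json mapping phoneme labels to ids is available locally; the module path and vocab file name are assumptions, not something this commit defines.

# Hypothetical usage sketch; the import path and "vocab.json" are assumptions.
from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

tokenizer = SpeechT5OpenjtalkTokenizer("vocab.json")

# Japanese text is converted to OpenJTalk phonemes; characters listed in
# NP_CHARACTERS (ASCII and full-width punctuation) are kept as literal
# tokens rather than being passed through g2p.
print(tokenizer._tokenize("こんにちは、世界。"))
# roughly: ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '、', 's', 'e', 'k', 'a', 'i', '。']

# The ordinary tokenizer call then maps tokens to ids via the JSON vocabulary.
ids = tokenizer("こんにちは、世界。").input_ids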