Upload 2 files
Browse files- bert_tokenizer.py +15 -6
- modeling_glycebert.py +17 -8
bert_tokenizer.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
import time
|
4 |
from pathlib import Path
|
5 |
-
from types import NoneType
|
6 |
from typing import List, Union, Optional
|
7 |
|
8 |
import tokenizers
|
9 |
import torch
|
|
|
10 |
from huggingface_hub import hf_hub_download
|
11 |
from huggingface_hub.file_download import http_user_agent
|
12 |
from pypinyin import pinyin, Style
|
@@ -24,10 +25,14 @@ from transformers import BertTokenizerFast, BatchEncoding
|
|
24 |
cache_path = Path(os.path.abspath(__file__)).parent
|
25 |
|
26 |
|
27 |
-
def download_file(filename: str):
|
28 |
if os.path.exists(cache_path / filename):
|
29 |
return
|
30 |
|
|
|
|
|
|
|
|
|
31 |
hf_hub_download(
|
32 |
"iioSnail/ChineseBERT-base",
|
33 |
filename,
|
@@ -42,25 +47,29 @@ class ChineseBertTokenizer(BertTokenizerFast):
|
|
42 |
def __init__(self, **kwargs):
|
43 |
super(ChineseBertTokenizer, self).__init__(**kwargs)
|
44 |
|
|
|
45 |
vocab_file = cache_path / 'vocab.txt'
|
46 |
config_path = cache_path / 'config'
|
|
|
|
|
|
|
47 |
self.max_length = 512
|
48 |
|
49 |
-
download_file('vocab.txt')
|
50 |
self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
|
51 |
|
52 |
# load pinyin map dict
|
53 |
-
download_file('config/pinyin_map.json')
|
54 |
with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
|
55 |
self.pinyin_dict = json.load(fin)
|
56 |
|
57 |
# load char id map tensor
|
58 |
-
download_file('config/id2pinyin.json')
|
59 |
with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
|
60 |
self.id2pinyin = json.load(fin)
|
61 |
|
62 |
# load pinyin map tensor
|
63 |
-
download_file('config/pinyin2tensor.json')
|
64 |
with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
|
65 |
self.pinyin2tensor = json.load(fin)
|
66 |
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
import shutil
|
4 |
import time
|
5 |
from pathlib import Path
|
|
|
6 |
from typing import List, Union, Optional
|
7 |
|
8 |
import tokenizers
|
9 |
import torch
|
10 |
+
from torch import NoneType
|
11 |
from huggingface_hub import hf_hub_download
|
12 |
from huggingface_hub.file_download import http_user_agent
|
13 |
from pypinyin import pinyin, Style
|
|
|
25 |
cache_path = Path(os.path.abspath(__file__)).parent
|
26 |
|
27 |
|
28 |
+
def download_file(filename: str, path: Path):
|
29 |
if os.path.exists(cache_path / filename):
|
30 |
return
|
31 |
|
32 |
+
if os.path.exists(path / filename):
|
33 |
+
shutil.copyfile(path / filename, cache_path / filename)
|
34 |
+
return
|
35 |
+
|
36 |
hf_hub_download(
|
37 |
"iioSnail/ChineseBERT-base",
|
38 |
filename,
|
|
|
47 |
def __init__(self, **kwargs):
|
48 |
super(ChineseBertTokenizer, self).__init__(**kwargs)
|
49 |
|
50 |
+
self.path = Path(kwargs['name_or_path'])
|
51 |
vocab_file = cache_path / 'vocab.txt'
|
52 |
config_path = cache_path / 'config'
|
53 |
+
if not os.path.exists(config_path):
|
54 |
+
os.makedirs(config_path)
|
55 |
+
|
56 |
self.max_length = 512
|
57 |
|
58 |
+
download_file('vocab.txt', self.path)
|
59 |
self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
|
60 |
|
61 |
# load pinyin map dict
|
62 |
+
download_file('config/pinyin_map.json', self.path)
|
63 |
with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
|
64 |
self.pinyin_dict = json.load(fin)
|
65 |
|
66 |
# load char id map tensor
|
67 |
+
download_file('config/id2pinyin.json', self.path)
|
68 |
with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
|
69 |
self.id2pinyin = json.load(fin)
|
70 |
|
71 |
# load pinyin map tensor
|
72 |
+
download_file('config/pinyin2tensor.json', self.path)
|
73 |
with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
|
74 |
self.pinyin2tensor = json.load(fin)
|
75 |
|
modeling_glycebert.py
CHANGED
@@ -10,6 +10,7 @@
|
|
10 |
"""
|
11 |
import json
|
12 |
import os
|
|
|
13 |
import time
|
14 |
import warnings
|
15 |
from pathlib import Path
|
@@ -32,14 +33,17 @@ except:
|
|
32 |
from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
|
33 |
QuestionAnsweringModelOutput, TokenClassifierOutput
|
34 |
|
35 |
-
|
36 |
cache_path = Path(os.path.abspath(__file__)).parent
|
37 |
|
38 |
|
39 |
-
def download_file(filename: str):
|
40 |
if os.path.exists(cache_path / filename):
|
41 |
return
|
42 |
|
|
|
|
|
|
|
|
|
43 |
hf_hub_download(
|
44 |
"iioSnail/ChineseBERT-base",
|
45 |
filename,
|
@@ -565,18 +569,22 @@ class FusionBertEmbeddings(nn.Module):
|
|
565 |
|
566 |
def __init__(self, config):
|
567 |
super(FusionBertEmbeddings, self).__init__()
|
|
|
568 |
config_path = cache_path / 'config'
|
|
|
|
|
|
|
569 |
font_files = []
|
570 |
-
download_file("config/STFANGSO.TTF24.npy")
|
571 |
-
download_file("config/STXINGKA.TTF24.npy")
|
572 |
-
download_file("config/方正古隶繁体.ttf24.npy")
|
573 |
for file in os.listdir(config_path):
|
574 |
if file.endswith(".npy"):
|
575 |
font_files.append(str(config_path / file))
|
576 |
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
577 |
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
578 |
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
579 |
-
self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size)
|
580 |
self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
|
581 |
|
582 |
# self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
|
@@ -624,7 +632,8 @@ class FusionBertEmbeddings(nn.Module):
|
|
624 |
|
625 |
|
626 |
class PinyinEmbedding(nn.Module):
|
627 |
-
|
|
|
628 |
"""
|
629 |
Pinyin Embedding Module
|
630 |
Args:
|
@@ -632,7 +641,7 @@ class PinyinEmbedding(nn.Module):
|
|
632 |
pinyin_out_dim: kernel number of conv
|
633 |
"""
|
634 |
super(PinyinEmbedding, self).__init__()
|
635 |
-
download_file('config/pinyin_map.json')
|
636 |
with open(cache_path / 'config' / 'pinyin_map.json') as fin:
|
637 |
pinyin_dict = json.load(fin)
|
638 |
self.pinyin_out_dim = pinyin_out_dim
|
|
|
10 |
"""
|
11 |
import json
|
12 |
import os
|
13 |
+
import shutil
|
14 |
import time
|
15 |
import warnings
|
16 |
from pathlib import Path
|
|
|
33 |
from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
|
34 |
QuestionAnsweringModelOutput, TokenClassifierOutput
|
35 |
|
|
|
36 |
cache_path = Path(os.path.abspath(__file__)).parent
|
37 |
|
38 |
|
39 |
+
def download_file(filename: str, path: Path):
|
40 |
if os.path.exists(cache_path / filename):
|
41 |
return
|
42 |
|
43 |
+
if os.path.exists(path / filename):
|
44 |
+
shutil.copyfile(path / filename, cache_path / filename)
|
45 |
+
return
|
46 |
+
|
47 |
hf_hub_download(
|
48 |
"iioSnail/ChineseBERT-base",
|
49 |
filename,
|
|
|
569 |
|
570 |
def __init__(self, config):
|
571 |
super(FusionBertEmbeddings, self).__init__()
|
572 |
+
self.path = Path(config._name_or_path)
|
573 |
config_path = cache_path / 'config'
|
574 |
+
if not os.path.exists(config_path):
|
575 |
+
os.makedirs(config_path)
|
576 |
+
|
577 |
font_files = []
|
578 |
+
download_file("config/STFANGSO.TTF24.npy", self.path)
|
579 |
+
download_file("config/STXINGKA.TTF24.npy", self.path)
|
580 |
+
download_file("config/方正古隶繁体.ttf24.npy", self.path)
|
581 |
for file in os.listdir(config_path):
|
582 |
if file.endswith(".npy"):
|
583 |
font_files.append(str(config_path / file))
|
584 |
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
585 |
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
586 |
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
587 |
+
self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size, config=config)
|
588 |
self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
|
589 |
|
590 |
# self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
|
|
|
632 |
|
633 |
|
634 |
class PinyinEmbedding(nn.Module):
|
635 |
+
|
636 |
+
def __init__(self, embedding_size: int, pinyin_out_dim: int, config):
|
637 |
"""
|
638 |
Pinyin Embedding Module
|
639 |
Args:
|
|
|
641 |
pinyin_out_dim: kernel number of conv
|
642 |
"""
|
643 |
super(PinyinEmbedding, self).__init__()
|
644 |
+
download_file('config/pinyin_map.json', Path(config._name_or_path))
|
645 |
with open(cache_path / 'config' / 'pinyin_map.json') as fin:
|
646 |
pinyin_dict = json.load(fin)
|
647 |
self.pinyin_out_dim = pinyin_out_dim
|