iioSnail committed on
Commit
3228023
1 Parent(s): fdd8bb9

Upload 2 files

Browse files
Files changed (2) hide show
  1. bert_tokenizer.py +15 -6
  2. modeling_glycebert.py +17 -8
bert_tokenizer.py CHANGED
@@ -1,12 +1,13 @@
1
  import json
2
  import os
 
3
  import time
4
  from pathlib import Path
5
- from types import NoneType
6
  from typing import List, Union, Optional
7
 
8
  import tokenizers
9
  import torch
 
10
  from huggingface_hub import hf_hub_download
11
  from huggingface_hub.file_download import http_user_agent
12
  from pypinyin import pinyin, Style
@@ -24,10 +25,14 @@ from transformers import BertTokenizerFast, BatchEncoding
24
  cache_path = Path(os.path.abspath(__file__)).parent
25
 
26
 
27
- def download_file(filename: str):
28
  if os.path.exists(cache_path / filename):
29
  return
30
 
 
 
 
 
31
  hf_hub_download(
32
  "iioSnail/ChineseBERT-base",
33
  filename,
@@ -42,25 +47,29 @@ class ChineseBertTokenizer(BertTokenizerFast):
42
  def __init__(self, **kwargs):
43
  super(ChineseBertTokenizer, self).__init__(**kwargs)
44
 
 
45
  vocab_file = cache_path / 'vocab.txt'
46
  config_path = cache_path / 'config'
 
 
 
47
  self.max_length = 512
48
 
49
- download_file('vocab.txt')
50
  self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
51
 
52
  # load pinyin map dict
53
- download_file('config/pinyin_map.json')
54
  with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
55
  self.pinyin_dict = json.load(fin)
56
 
57
  # load char id map tensor
58
- download_file('config/id2pinyin.json')
59
  with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
60
  self.id2pinyin = json.load(fin)
61
 
62
  # load pinyin map tensor
63
- download_file('config/pinyin2tensor.json')
64
  with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
65
  self.pinyin2tensor = json.load(fin)
66
 
 
1
  import json
2
  import os
3
+ import shutil
4
  import time
5
  from pathlib import Path
 
6
  from typing import List, Union, Optional
7
 
8
  import tokenizers
9
  import torch
10
+ from torch import NoneType
11
  from huggingface_hub import hf_hub_download
12
  from huggingface_hub.file_download import http_user_agent
13
  from pypinyin import pinyin, Style
 
25
  cache_path = Path(os.path.abspath(__file__)).parent
26
 
27
 
28
+ def download_file(filename: str, path: Path):
29
  if os.path.exists(cache_path / filename):
30
  return
31
 
32
+ if os.path.exists(path / filename):
33
+ shutil.copyfile(path / filename, cache_path / filename)
34
+ return
35
+
36
  hf_hub_download(
37
  "iioSnail/ChineseBERT-base",
38
  filename,
 
47
  def __init__(self, **kwargs):
48
  super(ChineseBertTokenizer, self).__init__(**kwargs)
49
 
50
+ self.path = Path(kwargs['name_or_path'])
51
  vocab_file = cache_path / 'vocab.txt'
52
  config_path = cache_path / 'config'
53
+ if not os.path.exists(config_path):
54
+ os.makedirs(config_path)
55
+
56
  self.max_length = 512
57
 
58
+ download_file('vocab.txt', self.path)
59
  self.tokenizer = BertWordPieceTokenizer(str(vocab_file))
60
 
61
  # load pinyin map dict
62
+ download_file('config/pinyin_map.json', self.path)
63
  with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
64
  self.pinyin_dict = json.load(fin)
65
 
66
  # load char id map tensor
67
+ download_file('config/id2pinyin.json', self.path)
68
  with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
69
  self.id2pinyin = json.load(fin)
70
 
71
  # load pinyin map tensor
72
+ download_file('config/pinyin2tensor.json', self.path)
73
  with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
74
  self.pinyin2tensor = json.load(fin)
75
 
modeling_glycebert.py CHANGED
@@ -10,6 +10,7 @@
10
  """
11
  import json
12
  import os
 
13
  import time
14
  import warnings
15
  from pathlib import Path
@@ -32,14 +33,17 @@ except:
32
  from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
33
  QuestionAnsweringModelOutput, TokenClassifierOutput
34
 
35
-
36
  cache_path = Path(os.path.abspath(__file__)).parent
37
 
38
 
39
- def download_file(filename: str):
40
  if os.path.exists(cache_path / filename):
41
  return
42
 
 
 
 
 
43
  hf_hub_download(
44
  "iioSnail/ChineseBERT-base",
45
  filename,
@@ -565,18 +569,22 @@ class FusionBertEmbeddings(nn.Module):
565
 
566
  def __init__(self, config):
567
  super(FusionBertEmbeddings, self).__init__()
 
568
  config_path = cache_path / 'config'
 
 
 
569
  font_files = []
570
- download_file("config/STFANGSO.TTF24.npy")
571
- download_file("config/STXINGKA.TTF24.npy")
572
- download_file("config/方正古隶繁体.ttf24.npy")
573
  for file in os.listdir(config_path):
574
  if file.endswith(".npy"):
575
  font_files.append(str(config_path / file))
576
  self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
577
  self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
578
  self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
579
- self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size)
580
  self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
581
 
582
  # self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
@@ -624,7 +632,8 @@ class FusionBertEmbeddings(nn.Module):
624
 
625
 
626
  class PinyinEmbedding(nn.Module):
627
- def __init__(self, embedding_size: int, pinyin_out_dim: int):
 
628
  """
629
  Pinyin Embedding Module
630
  Args:
@@ -632,7 +641,7 @@ class PinyinEmbedding(nn.Module):
632
  pinyin_out_dim: kernel number of conv
633
  """
634
  super(PinyinEmbedding, self).__init__()
635
- download_file('config/pinyin_map.json')
636
  with open(cache_path / 'config' / 'pinyin_map.json') as fin:
637
  pinyin_dict = json.load(fin)
638
  self.pinyin_out_dim = pinyin_out_dim
 
10
  """
11
  import json
12
  import os
13
+ import shutil
14
  import time
15
  import warnings
16
  from pathlib import Path
 
33
  from transformers.modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput, \
34
  QuestionAnsweringModelOutput, TokenClassifierOutput
35
 
 
36
  cache_path = Path(os.path.abspath(__file__)).parent
37
 
38
 
39
+ def download_file(filename: str, path: Path):
40
  if os.path.exists(cache_path / filename):
41
  return
42
 
43
+ if os.path.exists(path / filename):
44
+ shutil.copyfile(path / filename, cache_path / filename)
45
+ return
46
+
47
  hf_hub_download(
48
  "iioSnail/ChineseBERT-base",
49
  filename,
 
569
 
570
  def __init__(self, config):
571
  super(FusionBertEmbeddings, self).__init__()
572
+ self.path = Path(config._name_or_path)
573
  config_path = cache_path / 'config'
574
+ if not os.path.exists(config_path):
575
+ os.makedirs(config_path)
576
+
577
  font_files = []
578
+ download_file("config/STFANGSO.TTF24.npy", self.path)
579
+ download_file("config/STXINGKA.TTF24.npy", self.path)
580
+ download_file("config/方正古隶繁体.ttf24.npy", self.path)
581
  for file in os.listdir(config_path):
582
  if file.endswith(".npy"):
583
  font_files.append(str(config_path / file))
584
  self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
585
  self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
586
  self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
587
+ self.pinyin_embeddings = PinyinEmbedding(embedding_size=128, pinyin_out_dim=config.hidden_size, config=config)
588
  self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
589
 
590
  # self.LayerNorm is not snake-cased to stick with TensorFlow models variable name and be able to load
 
632
 
633
 
634
  class PinyinEmbedding(nn.Module):
635
+
636
+ def __init__(self, embedding_size: int, pinyin_out_dim: int, config):
637
  """
638
  Pinyin Embedding Module
639
  Args:
 
641
  pinyin_out_dim: kernel number of conv
642
  """
643
  super(PinyinEmbedding, self).__init__()
644
+ download_file('config/pinyin_map.json', Path(config._name_or_path))
645
  with open(cache_path / 'config' / 'pinyin_map.json') as fin:
646
  pinyin_dict = json.load(fin)
647
  self.pinyin_out_dim = pinyin_out_dim