Commit c025ecb
1 Parent(s): 0f2ff87

Upload TFBilma

- modeling_bilma.py +9 -9
- tf_model.h5 +1 -1
modeling_bilma.py
CHANGED
@@ -1,4 +1,4 @@
-from transformers import TFPreTrainedModel
+from transformers import TFPreTrainedModel, PreTrainedTokenizer
 from tensorflow.keras.models import Model, load_model, Sequential
 from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
 import tensorflow as tf
@@ -9,7 +9,7 @@ from typing import Dict
 import re
 import unicodedata
 
-from
+from configuration_bilma import BilmaConfig
 
 # copied from preprocessing.py
 BLANK = ' '
@@ -32,7 +32,7 @@ SYMBOLS = set(";:,.@\\-\"/" + SYMBOLS_)
 
 class TFBilma(TFPreTrainedModel):
     config_class = BilmaConfig
-    main_input_name = "
+    main_input_name = "input_ids"
     #base_model_prefix = "bilma"
 
     def __init__(self, config):
@@ -67,7 +67,7 @@ class TFBilma(TFPreTrainedModel):
     @property
     def input_signature(self) -> Dict[str, tf.TensorSpec]:
         sig = {}
-        sig["
+        sig["input_ids"] = tf.TensorSpec([None, self.seq_max_length], tf.int32, name="input_ids")
         return sig
 
 
@@ -364,7 +364,7 @@ def preprocess(text):
 # Copied from wordpiece_tokenizer_ex.py
 # -------------------------------------
 
-class
+class BaseTokenizer():
     def __init__(self, vocab_file, unk_token="[UNK]", end_token="[END]", mask_token="[MASK]"):
         self.word2idx = {}
         self.idx2word = []
@@ -484,7 +484,7 @@ def accuracy_function(ignore_id=0):
     return acc_mlm
 
 def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1):
-    capt_inputs_ids = Input(shape=(max_length, ), name='
+    capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
     capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
     capt_inputs = capt_embedding(capt_inputs_ids)
 
@@ -503,10 +503,10 @@ def load(model_file):
     }
     return load_model(model_file, custom_objects=custom_objects)
 
-class
+class BilmaTokenizer():
     def __init__(self, vocab_file, max_length):
-        self.tokenizer =
+        self.tokenizer = BaseTokenizer(vocab_file)
-        self.emo_labels = "β€ππππππππππ‘π’ππ€π₯Ί"
+        #self.emo_labels = "β€ππππππππππ‘π’ππ€π₯Ί"
         self.max_length = max_length
         self.START = 2
         self.END = 3
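Not part of the commit, but for orientation: a minimal sketch of how the renamed "input_ids" input could be exercised after this change. It assumes modeling_bilma.py and configuration_bilma.py are importable from the working directory, that BilmaConfig() is valid with its defaults, that the model's call accepts a dict keyed by main_input_name, and that 50 stands in for the configured sequence length; none of this is confirmed by the diff itself.

# Hypothetical usage sketch; names not shown in the diff are assumptions.
import tensorflow as tf
from configuration_bilma import BilmaConfig   # same import the commit adds
from modeling_bilma import TFBilma

config = BilmaConfig()                 # assumes the default arguments are acceptable
model = TFBilma(config)

# Inputs now travel under the "input_ids" key declared by main_input_name
# and input_signature; 50 is a placeholder for the model's seq_max_length.
batch = {"input_ids": tf.zeros((1, 50), dtype=tf.int32)}
outputs = model(batch)                 # assumes the call accepts a dict of tensors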
tf_model.h5
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:28582d643f857938d54653b1eb5481a6f4a8d68d0909b1af58371c55806e9048
 size 156564220