guillermoruiz committed
Commit c025ecb • 1 Parent(s): 0f2ff87

Upload TFBilma

Files changed (2):
  1. modeling_bilma.py +9 -9
  2. tf_model.h5 +1 -1
modeling_bilma.py CHANGED
@@ -1,4 +1,4 @@
-from transformers import TFPreTrainedModel
+from transformers import TFPreTrainedModel, PreTrainedTokenizer
 from tensorflow.keras.models import Model, load_model, Sequential
 from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
 import tensorflow as tf
@@ -9,7 +9,7 @@ from typing import Dict
 import re
 import unicodedata
 
-from .configuration_bilma import BilmaConfig
+from configuration_bilma import BilmaConfig
 
 # copied from preprocessing.py
 BLANK = ' '
@@ -32,7 +32,7 @@ SYMBOLS = set(";:,.@\\-\"/" + SYMBOLS_)
 
 class TFBilma(TFPreTrainedModel):
     config_class = BilmaConfig
-    main_input_name = "capt_input"
+    main_input_name = "input_ids"
     #base_model_prefix = "bilma"
 
     def __init__(self, config):
@@ -67,7 +67,7 @@ class TFBilma(TFPreTrainedModel):
     @property
     def input_signature(self) -> Dict[str, tf.TensorSpec]:
         sig = {}
-        sig["capt_input"] = tf.TensorSpec([None, self.seq_max_length], tf.int32, name="capt_input")
+        sig["input_ids"] = tf.TensorSpec([None, self.seq_max_length], tf.int32, name="input_ids")
         return sig
 
 
@@ -364,7 +364,7 @@ def preprocess(text):
 # Copied from wordpiece_tokenizer_ex.py
 # -------------------------------------
 
-class Tokenizer():
+class BaseTokenizer():
     def __init__(self, vocab_file, unk_token="[UNK]", end_token="[END]", mask_token="[MASK]"):
         self.word2idx = {}
         self.idx2word = []
@@ -484,7 +484,7 @@ def accuracy_function(ignore_id=0):
     return acc_mlm
 
 def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1):
-    capt_inputs_ids = Input(shape=(max_length, ), name='capt_input')
+    capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
     capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
     capt_inputs = capt_embedding(capt_inputs_ids)
 
@@ -503,10 +503,10 @@ def load(model_file):
     }
     return load_model(model_file, custom_objects=custom_objects)
 
-class tokenizer():
+class BilmaTokenizer():
     def __init__(self, vocab_file, max_length):
-        self.tokenizer = Tokenizer(vocab_file)
-        self.emo_labels = "❤👌👍💔😄😊😌😍😒😘😡😢😭🤔🥺"
+        self.tokenizer = BaseTokenizer(vocab_file)
+        #self.emo_labels = "❤👌👍💔😄😊😌😍😒😘😡😢😭🤔🥺"
         self.max_length = max_length
         self.START = 2
         self.END = 3
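The net effect of these renames is to align the custom model with the transformers convention: main_input_name, the Keras Input name, and the input_signature key now all agree on "input_ids", so dummy inputs and serving signatures that the library derives from main_input_name resolve correctly. A minimal usage sketch, assuming the repo's config.json wires up TFBilma via auto_map; the repo id and the max length of 50 are placeholders, not taken from this commit:

import tensorflow as tf
from transformers import TFAutoModel

# trust_remote_code=True is required because the TFBilma class ships
# inside the model repo rather than in the transformers library.
model = TFAutoModel.from_pretrained(
    "guillermoruiz/bilma",  # hypothetical repo id
    trust_remote_code=True,
)

# After this commit the model expects the standard key (formerly "capt_input").
ids = tf.zeros((1, 50), dtype=tf.int32)  # 50 mirrors the bilma() default max_length
out = model({"input_ids": ids})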
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2b66af189fde956eb4a944a6473178c837e1e3616230fc6049a11ed1c1b38379
+oid sha256:28582d643f857938d54653b1eb5481a6f4a8d68d0909b1af58371c55806e9048
 size 156564220
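tf_model.h5 is stored through Git LFS, so the diff touches only the pointer file: the sha256 oid changed while the size stayed identical, meaning the weight bytes were regenerated. A minimal sketch for checking a local copy against the new pointer, using only the standard library:

import hashlib

def lfs_oid(path, chunk_size=1 << 20):
    # The pointer's "oid sha256:..." field is simply the SHA-256 of the file contents.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            h.update(block)
    return h.hexdigest()

# Should print 28582d643f857938d54653b1eb5481a6f4a8d68d0909b1af58371c55806e9048
print(lfs_oid("tf_model.h5"))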